wordless/wl_colligation_extractor.py
# ----------------------------------------------------------------------
# Wordless: Colligation Extractor
# Copyright (C) 2018-2024 Ye Lei (叶磊)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------
# pylint: disable=broad-exception-caught
import bisect
import collections
import copy
import operator
import re
import traceback
import numpy
from PyQt5.QtCore import pyqtSignal, QCoreApplication, Qt
from PyQt5.QtWidgets import QLabel, QGroupBox
from wordless.wl_checks import wl_checks_work_area
from wordless.wl_dialogs import wl_dialogs_misc
from wordless.wl_figs import wl_figs, wl_figs_freqs, wl_figs_stats
from wordless.wl_nlp import (
wl_matching,
wl_nlp_utils,
wl_texts,
wl_token_processing
)
from wordless.wl_utils import wl_misc, wl_sorting, wl_threading
from wordless.wl_widgets import (
wl_boxes,
wl_layouts,
wl_tables,
wl_widgets
)
_tr = QCoreApplication.translate
class Wrapper_Colligation_Extractor(wl_layouts.Wl_Wrapper):
    """Work-area wrapper of the Colligation Extractor.

    Builds the results table and the token, search, generation, table and
    figure settings panels, and mirrors the state of every settings widget
    into ``main.settings_custom['colligation_extractor']``.
    """

    def __init__(self, main):
        """Create all widgets, wire their signals and load the saved settings."""
        super().__init__(main)

        # Table
        self.table_colligation_extractor = Wl_Table_Colligation_Extractor(self)
        table = self.table_colligation_extractor

        layout_results = wl_layouts.Wl_Layout()
        layout_results.addWidget(table.label_num_results, 0, 0)
        layout_results.addWidget(table.button_results_filter, 0, 2)
        layout_results.addWidget(table.button_results_search, 0, 3)
        layout_results.setColumnStretch(1, 1)

        layout_table = self.wrapper_table.layout()
        layout_table.addLayout(layout_results, 0, 0, 1, 5)
        layout_table.addWidget(table, 1, 0, 1, 5)

        # The action buttons share the bottom row, one per column
        for col, button in enumerate((
            table.button_generate_table,
            table.button_generate_fig,
            table.button_exp_selected_cells,
            table.button_exp_all_cells,
            table.button_clr_table
        )):
            layout_table.addWidget(button, 2, col)

        # Token Settings
        self.group_box_token_settings = QGroupBox(self.tr('Token Settings'), self)

        (
            self.checkbox_words,
            self.checkbox_all_lowercase,
            self.checkbox_all_uppercase,
            self.checkbox_title_case,
            self.checkbox_nums,
            self.checkbox_punc_marks,
            self.checkbox_treat_as_all_lowercase,
            self.checkbox_apply_lemmatization,
            self.checkbox_filter_stop_words,
            self.checkbox_assign_pos_tags,
            self.checkbox_ignore_tags,
            self.checkbox_use_tags
        ) = wl_widgets.wl_widgets_token_settings(self)

        self.checkbox_assign_pos_tags.hide()

        # The hidden "assign POS tags" checkbox is deliberately not connected
        for checkbox in (
            self.checkbox_words,
            self.checkbox_all_lowercase,
            self.checkbox_all_uppercase,
            self.checkbox_title_case,
            self.checkbox_nums,
            self.checkbox_punc_marks,
            self.checkbox_treat_as_all_lowercase,
            self.checkbox_apply_lemmatization,
            self.checkbox_filter_stop_words,
            self.checkbox_ignore_tags,
            self.checkbox_use_tags
        ):
            checkbox.stateChanged.connect(self.token_settings_changed)

        self.group_box_token_settings.setLayout(wl_layouts.Wl_Layout())
        layout_token = self.group_box_token_settings.layout()

        # Token type checkboxes fill a 3 x 2 grid row by row
        for idx, checkbox in enumerate((
            self.checkbox_words,
            self.checkbox_all_lowercase,
            self.checkbox_all_uppercase,
            self.checkbox_title_case,
            self.checkbox_nums,
            self.checkbox_punc_marks
        )):
            row, col = divmod(idx, 2)
            layout_token.addWidget(checkbox, row, col)

        layout_token.addWidget(wl_layouts.Wl_Separator(self), 3, 0, 1, 2)

        for row, checkbox in enumerate((
            self.checkbox_treat_as_all_lowercase,
            self.checkbox_apply_lemmatization,
            self.checkbox_filter_stop_words
        ), start = 4):
            layout_token.addWidget(checkbox, row, 0, 1, 2)

        layout_token.addWidget(wl_layouts.Wl_Separator(self), 7, 0, 1, 2)
        layout_token.addWidget(self.checkbox_ignore_tags, 8, 0)
        layout_token.addWidget(self.checkbox_use_tags, 8, 1)

        # Search Settings
        self.group_box_search_settings = QGroupBox(self.tr('Search Settings'), self)

        (
            self.label_search_term,
            self.checkbox_multi_search_mode,
            self.stacked_widget_search_term,
            self.line_edit_search_term,
            self.list_search_terms,
            self.label_delimiter,
            self.checkbox_match_case,
            self.checkbox_match_whole_words,
            self.checkbox_match_inflected_forms,
            self.checkbox_use_regex,
            self.checkbox_match_without_tags,
            self.checkbox_match_tags
        ) = wl_widgets.wl_widgets_search_settings(
            self,
            tab = 'colligation_extractor'
        )

        (
            self.label_context_settings,
            self.button_context_settings
        ) = wl_widgets.wl_widgets_context_settings(
            self,
            tab = 'colligation_extractor'
        )

        self.checkbox_multi_search_mode.stateChanged.connect(self.search_settings_changed)
        self.line_edit_search_term.textChanged.connect(self.search_settings_changed)
        # Pressing Enter in the search box acts like clicking "Generate table"
        self.line_edit_search_term.returnPressed.connect(table.button_generate_table.click)
        self.list_search_terms.model().dataChanged.connect(self.search_settings_changed)

        for checkbox in (
            self.checkbox_match_case,
            self.checkbox_match_whole_words,
            self.checkbox_match_inflected_forms,
            self.checkbox_use_regex,
            self.checkbox_match_without_tags,
            self.checkbox_match_tags
        ):
            checkbox.stateChanged.connect(self.search_settings_changed)

        layout_context_settings = wl_layouts.Wl_Layout()
        layout_context_settings.addWidget(self.label_context_settings, 0, 0)
        layout_context_settings.addWidget(self.button_context_settings, 0, 1)
        layout_context_settings.setColumnStretch(1, 1)

        self.group_box_search_settings.setLayout(wl_layouts.Wl_Layout())
        layout_search = self.group_box_search_settings.layout()

        layout_search.addWidget(self.label_search_term, 0, 0)
        layout_search.addWidget(self.checkbox_multi_search_mode, 0, 1, Qt.AlignRight)

        # Rows 1-8 each hold one widget spanning both columns
        for row, widget in enumerate((
            self.stacked_widget_search_term,
            self.label_delimiter,
            self.checkbox_match_case,
            self.checkbox_match_whole_words,
            self.checkbox_match_inflected_forms,
            self.checkbox_use_regex,
            self.checkbox_match_without_tags,
            self.checkbox_match_tags
        ), start = 1):
            layout_search.addWidget(widget, row, 0, 1, 2)

        layout_search.addWidget(wl_layouts.Wl_Separator(self), 9, 0, 1, 2)
        layout_search.addLayout(layout_context_settings, 10, 0, 1, 2)

        # Generation Settings
        self.group_box_generation_settings = QGroupBox(self.tr('Generation Settings'))

        self.label_window = QLabel(self.tr('Collocational window:'), self)
        (
            self.checkbox_window_sync,
            self.label_window_left,
            self.spin_box_window_left,
            self.label_window_right,
            self.spin_box_window_right
        ) = wl_boxes.wl_spin_boxes_min_max_sync_window(self)

        self.label_limit_searching = QLabel(self.tr('Limit searching:'), self)
        self.combo_box_limit_searching = wl_boxes.Wl_Combo_Box(self)

        (
            self.label_test_statistical_significance,
            self.combo_box_test_statistical_significance,
            self.label_measure_bayes_factor,
            self.combo_box_measure_bayes_factor,
            self.label_measure_effect_size,
            self.combo_box_measure_effect_size
        ) = wl_widgets.wl_widgets_measures_collocation_extractor(self, tab = 'collocation_extractor')

        self.combo_box_limit_searching.addItems([
            self.tr('None'),
            self.tr('Within sentence segments'),
            self.tr('Within sentences'),
            self.tr('Within paragraphs')
        ])

        self.checkbox_window_sync.stateChanged.connect(self.generation_settings_changed)
        self.spin_box_window_left.valueChanged.connect(self.generation_settings_changed)
        self.spin_box_window_right.valueChanged.connect(self.generation_settings_changed)

        for combo_box in (
            self.combo_box_limit_searching,
            self.combo_box_test_statistical_significance,
            self.combo_box_measure_bayes_factor,
            self.combo_box_measure_effect_size
        ):
            combo_box.currentTextChanged.connect(self.generation_settings_changed)

        layout_settings_limit_searching = wl_layouts.Wl_Layout()
        layout_settings_limit_searching.addWidget(self.label_limit_searching, 0, 0)
        layout_settings_limit_searching.addWidget(self.combo_box_limit_searching, 0, 1)
        layout_settings_limit_searching.setColumnStretch(1, 1)

        self.group_box_generation_settings.setLayout(wl_layouts.Wl_Layout())
        layout_generation = self.group_box_generation_settings.layout()

        layout_generation.addWidget(self.label_window, 0, 0, 1, 3)
        layout_generation.addWidget(self.checkbox_window_sync, 0, 3, Qt.AlignRight)

        for col, widget in enumerate((
            self.label_window_left,
            self.spin_box_window_left,
            self.label_window_right,
            self.spin_box_window_right
        )):
            layout_generation.addWidget(widget, 1, col)

        layout_generation.addLayout(layout_settings_limit_searching, 2, 0, 1, 4)
        layout_generation.addWidget(wl_layouts.Wl_Separator(self), 3, 0, 1, 4)

        # Measure labels and combo boxes are stacked, each spanning all 4 columns
        for row, widget in enumerate((
            self.label_test_statistical_significance,
            self.combo_box_test_statistical_significance,
            self.label_measure_bayes_factor,
            self.combo_box_measure_bayes_factor,
            self.label_measure_effect_size,
            self.combo_box_measure_effect_size
        ), start = 4):
            layout_generation.addWidget(widget, row, 0, 1, 4)

        layout_generation.setColumnStretch(1, 1)
        layout_generation.setColumnStretch(3, 1)

        # Table Settings
        self.group_box_table_settings = QGroupBox(self.tr('Table Settings'))

        (
            self.checkbox_show_pct_data,
            self.checkbox_show_cum_data,
            self.checkbox_show_breakdown_span_position,
            self.checkbox_show_breakdown_file
        ) = wl_widgets.wl_widgets_table_settings_span_position(
            self,
            tables = [self.table_colligation_extractor]
        )

        checkboxes_table_settings = (
            self.checkbox_show_pct_data,
            self.checkbox_show_cum_data,
            self.checkbox_show_breakdown_span_position,
            self.checkbox_show_breakdown_file
        )

        for checkbox in checkboxes_table_settings:
            checkbox.stateChanged.connect(self.table_settings_changed)

        self.group_box_table_settings.setLayout(wl_layouts.Wl_Layout())
        layout_table_settings = self.group_box_table_settings.layout()

        for row, checkbox in enumerate(checkboxes_table_settings):
            layout_table_settings.addWidget(checkbox, row, 0)

        # Figure Settings
        self.group_box_fig_settings = QGroupBox(self.tr('Figure Settings'), self)

        (
            self.label_graph_type,
            self.combo_box_graph_type,
            self.label_sort_by_file,
            self.combo_box_sort_by_file,
            self.label_use_data,
            self.combo_box_use_data,
            self.checkbox_use_pct,
            self.checkbox_use_cumulative
        ) = wl_widgets.wl_widgets_fig_settings(self, tab = 'colligation_extractor')

        self.label_rank = QLabel(self.tr('Rank:'), self)
        (
            self.checkbox_rank_sync,
            self.label_rank_min,
            self.spin_box_rank_min,
            self.checkbox_rank_min_no_limit,
            self.label_rank_max,
            self.spin_box_rank_max,
            self.checkbox_rank_max_no_limit
        ) = wl_boxes.wl_spin_boxes_min_max_no_limit(
            self,
            val_min = 1,
            val_max = 100000
        )

        for combo_box in (
            self.combo_box_graph_type,
            self.combo_box_sort_by_file,
            self.combo_box_use_data
        ):
            combo_box.currentTextChanged.connect(self.fig_settings_changed)

        self.checkbox_use_pct.stateChanged.connect(self.fig_settings_changed)
        self.checkbox_use_cumulative.stateChanged.connect(self.fig_settings_changed)
        self.spin_box_rank_min.valueChanged.connect(self.fig_settings_changed)
        self.checkbox_rank_min_no_limit.stateChanged.connect(self.fig_settings_changed)
        self.spin_box_rank_max.valueChanged.connect(self.fig_settings_changed)
        self.checkbox_rank_max_no_limit.stateChanged.connect(self.fig_settings_changed)

        layout_fig_settings_combo_boxes = wl_layouts.Wl_Layout()

        for row, (label, combo_box) in enumerate((
            (self.label_graph_type, self.combo_box_graph_type),
            (self.label_sort_by_file, self.combo_box_sort_by_file),
            (self.label_use_data, self.combo_box_use_data)
        )):
            layout_fig_settings_combo_boxes.addWidget(label, row, 0)
            layout_fig_settings_combo_boxes.addWidget(combo_box, row, 1)

        layout_fig_settings_combo_boxes.setColumnStretch(1, 1)

        self.group_box_fig_settings.setLayout(wl_layouts.Wl_Layout())
        layout_fig = self.group_box_fig_settings.layout()

        layout_fig.addLayout(layout_fig_settings_combo_boxes, 0, 0, 1, 3)
        layout_fig.addWidget(self.checkbox_use_pct, 1, 0, 1, 3)
        layout_fig.addWidget(self.checkbox_use_cumulative, 2, 0, 1, 3)
        layout_fig.addWidget(wl_layouts.Wl_Separator(self), 3, 0, 1, 3)
        layout_fig.addWidget(self.label_rank, 4, 0, 1, 2)
        layout_fig.addWidget(self.checkbox_rank_sync, 4, 2)

        for row, widgets_rank in enumerate((
            (self.label_rank_min, self.spin_box_rank_min, self.checkbox_rank_min_no_limit),
            (self.label_rank_max, self.spin_box_rank_max, self.checkbox_rank_max_no_limit)
        ), start = 5):
            for col, widget in enumerate(widgets_rank):
                layout_fig.addWidget(widget, row, col)

        layout_fig.setColumnStretch(1, 1)

        # Stack the settings panels vertically in the settings area
        layout_settings = self.wrapper_settings.layout()

        for row, group_box in enumerate((
            self.group_box_token_settings,
            self.group_box_search_settings,
            self.group_box_generation_settings,
            self.group_box_table_settings,
            self.group_box_fig_settings
        )):
            layout_settings.addWidget(group_box, row, 0)

        self.load_settings()

    def load_settings(self, defaults = False):
        """Push stored settings into the widgets.

        When ``defaults`` is true the default settings are used instead of the
        custom ones, the search terms are left untouched and the context
        settings dialog is reset as well.
        """
        settings_source = self.main.settings_default if defaults else self.main.settings_custom
        # Work on a copy: the slots fired below write back into settings_custom
        settings = copy.deepcopy(settings_source['colligation_extractor'])

        # Token Settings
        settings_token = settings['token_settings']

        for checkbox, key in (
            (self.checkbox_words, 'words'),
            (self.checkbox_all_lowercase, 'all_lowercase'),
            (self.checkbox_all_uppercase, 'all_uppercase'),
            (self.checkbox_title_case, 'title_case'),
            (self.checkbox_nums, 'nums'),
            (self.checkbox_punc_marks, 'punc_marks'),
            (self.checkbox_treat_as_all_lowercase, 'treat_as_all_lowercase'),
            (self.checkbox_apply_lemmatization, 'apply_lemmatization'),
            (self.checkbox_filter_stop_words, 'filter_stop_words'),
            (self.checkbox_ignore_tags, 'ignore_tags'),
            (self.checkbox_use_tags, 'use_tags')
        ):
            checkbox.setChecked(settings_token[key])

        # Search Settings
        settings_search = settings['search_settings']

        self.checkbox_multi_search_mode.setChecked(settings_search['multi_search_mode'])

        if not defaults:
            self.line_edit_search_term.setText(settings_search['search_term'])
            self.list_search_terms.load_items(settings_search['search_terms'])

        for checkbox, key in (
            (self.checkbox_match_case, 'match_case'),
            (self.checkbox_match_whole_words, 'match_whole_words'),
            (self.checkbox_match_inflected_forms, 'match_inflected_forms'),
            (self.checkbox_use_regex, 'use_regex'),
            (self.checkbox_match_without_tags, 'match_without_tags'),
            (self.checkbox_match_tags, 'match_tags')
        ):
            checkbox.setChecked(settings_search[key])

        # Context Settings
        if defaults:
            self.main.wl_context_settings_colligation_extractor.load_settings(defaults = True)

        # Generation Settings
        settings_generation = settings['generation_settings']

        self.checkbox_window_sync.setChecked(settings_generation['window_sync'])

        # Negative offsets are shown as "L<n>", the others as "R<n>"
        for spin_box, key in (
            (self.spin_box_window_left, 'window_left'),
            (self.spin_box_window_right, 'window_right')
        ):
            offset = settings_generation[key]

            if offset < 0:
                spin_box.setPrefix(self.tr('L'))
                spin_box.setValue(-offset)
            else:
                spin_box.setPrefix(self.tr('R'))
                spin_box.setValue(offset)

        self.combo_box_limit_searching.setCurrentText(settings_generation['limit_searching'])

        for combo_box, key in (
            (self.combo_box_test_statistical_significance, 'test_statistical_significance'),
            (self.combo_box_measure_bayes_factor, 'measure_bayes_factor'),
            (self.combo_box_measure_effect_size, 'measure_effect_size')
        ):
            combo_box.set_measure(settings_generation[key])

        # Table Settings
        settings_table = settings['table_settings']

        for checkbox, key in (
            (self.checkbox_show_pct_data, 'show_pct_data'),
            (self.checkbox_show_cum_data, 'show_cum_data'),
            (self.checkbox_show_breakdown_span_position, 'show_breakdown_span_position'),
            (self.checkbox_show_breakdown_file, 'show_breakdown_file')
        ):
            checkbox.setChecked(settings_table[key])

        # Figure Settings
        settings_fig = settings['fig_settings']

        self.combo_box_graph_type.setCurrentText(settings_fig['graph_type'])
        self.combo_box_sort_by_file.setCurrentText(settings_fig['sort_by_file'])
        self.combo_box_use_data.setCurrentText(settings_fig['use_data'])
        self.checkbox_use_pct.setChecked(settings_fig['use_pct'])
        self.checkbox_use_cumulative.setChecked(settings_fig['use_cumulative'])

        self.spin_box_rank_min.setValue(settings_fig['rank_min'])
        self.checkbox_rank_min_no_limit.setChecked(settings_fig['rank_min_no_limit'])
        self.spin_box_rank_max.setValue(settings_fig['rank_max'])
        self.checkbox_rank_max_no_limit.setChecked(settings_fig['rank_max_no_limit'])

        # Make sure every settings section is written back at least once
        self.token_settings_changed()
        self.search_settings_changed()
        self.generation_settings_changed()
        self.table_settings_changed()
        self.fig_settings_changed()

    def token_settings_changed(self):
        """Store the token settings and notify the widgets that depend on them."""
        settings = self.main.settings_custom['colligation_extractor']['token_settings']

        for key, checkbox in (
            ('words', self.checkbox_words),
            ('all_lowercase', self.checkbox_all_lowercase),
            ('all_uppercase', self.checkbox_all_uppercase),
            ('title_case', self.checkbox_title_case),
            ('nums', self.checkbox_nums),
            ('punc_marks', self.checkbox_punc_marks),
            ('treat_as_all_lowercase', self.checkbox_treat_as_all_lowercase),
            ('apply_lemmatization', self.checkbox_apply_lemmatization),
            ('filter_stop_words', self.checkbox_filter_stop_words),
            ('ignore_tags', self.checkbox_ignore_tags),
            ('use_tags', self.checkbox_use_tags)
        ):
            settings[key] = checkbox.isChecked()

        self.checkbox_match_tags.token_settings_changed()
        self.main.wl_context_settings_colligation_extractor.token_settings_changed()

    def search_settings_changed(self):
        """Store the search settings."""
        settings = self.main.settings_custom['colligation_extractor']['search_settings']

        settings['multi_search_mode'] = self.checkbox_multi_search_mode.isChecked()
        settings['search_term'] = self.line_edit_search_term.text()
        settings['search_terms'] = self.list_search_terms.model().stringList()

        for key, checkbox in (
            ('match_case', self.checkbox_match_case),
            ('match_whole_words', self.checkbox_match_whole_words),
            ('match_inflected_forms', self.checkbox_match_inflected_forms),
            ('use_regex', self.checkbox_use_regex),
            ('match_without_tags', self.checkbox_match_without_tags),
            ('match_tags', self.checkbox_match_tags)
        ):
            settings[key] = checkbox.isChecked()

    def generation_settings_changed(self):
        """Store the generation settings and refresh the "use data" combo box."""
        settings = self.main.settings_custom['colligation_extractor']['generation_settings']

        settings['window_sync'] = self.checkbox_window_sync.isChecked()

        # An "L" prefix means the position is left of the node: store it negated
        for key, spin_box in (
            ('window_left', self.spin_box_window_left),
            ('window_right', self.spin_box_window_right)
        ):
            offset = spin_box.value()
            settings[key] = -offset if spin_box.prefix() == self.tr('L') else offset

        settings['limit_searching'] = self.combo_box_limit_searching.currentText()

        for key, combo_box in (
            ('test_statistical_significance', self.combo_box_test_statistical_significance),
            ('measure_bayes_factor', self.combo_box_measure_bayes_factor),
            ('measure_effect_size', self.combo_box_measure_effect_size)
        ):
            settings[key] = combo_box.get_measure()

        # Use Data
        self.combo_box_use_data.measures_changed()

    def table_settings_changed(self):
        """Store the table settings."""
        settings = self.main.settings_custom['colligation_extractor']['table_settings']

        for key, checkbox in (
            ('show_pct_data', self.checkbox_show_pct_data),
            ('show_cum_data', self.checkbox_show_cum_data),
            ('show_breakdown_span_position', self.checkbox_show_breakdown_span_position),
            ('show_breakdown_file', self.checkbox_show_breakdown_file)
        ):
            settings[key] = checkbox.isChecked()

    def fig_settings_changed(self):
        """Store the figure settings."""
        settings = self.main.settings_custom['colligation_extractor']['fig_settings']

        for key, combo_box in (
            ('graph_type', self.combo_box_graph_type),
            ('sort_by_file', self.combo_box_sort_by_file),
            ('use_data', self.combo_box_use_data)
        ):
            settings[key] = combo_box.currentText()

        settings['use_pct'] = self.checkbox_use_pct.isChecked()
        settings['use_cumulative'] = self.checkbox_use_cumulative.isChecked()

        settings['rank_min'] = self.spin_box_rank_min.value()
        settings['rank_min_no_limit'] = self.checkbox_rank_min_no_limit.isChecked()
        settings['rank_max'] = self.spin_box_rank_max.value()
        settings['rank_max_no_limit'] = self.checkbox_rank_max_no_limit.isChecked()
class Wl_Table_Colligation_Extractor(wl_tables.Wl_Table_Data_Filter_Search):
def __init__(self, parent):
    """Set up the results table with its fixed leading and trailing columns.

    ``parent`` is kept as ``self.wrapper`` so the table can reach the wrapper
    it was created by.
    """
    # Translate each fixed header once and reuse it in the header groups
    header_rank = _tr('Wl_Table_Colligation_Extractor', 'Rank')
    header_node = _tr('Wl_Table_Colligation_Extractor', 'Node')
    header_collocate = _tr('Wl_Table_Colligation_Extractor', 'Collocate')
    header_files_found = _tr('Wl_Table_Colligation_Extractor', 'Number of\nFiles Found')
    header_files_found_pct = _tr('Wl_Table_Colligation_Extractor', 'Number of\nFiles Found %')

    super().__init__(
        parent,
        tab = 'colligation_extractor',
        headers = [
            header_rank,
            header_node,
            header_collocate,
            header_files_found,
            header_files_found_pct
        ],
        headers_int = [header_rank, header_files_found],
        headers_pct = [header_files_found_pct],
        enable_sorting = True
    )

    self.wrapper = parent
@wl_misc.log_time
def generate_table(self):
    """Start the background worker that computes the table data.

    Nothing is started unless the search terms pass validation and the
    POS-tagger support check succeeds (checked in that order).
    """
    search_settings = self.main.settings_custom['colligation_extractor']['search_settings']

    if not wl_checks_work_area.check_search_terms(self.main, search_settings = search_settings):
        return

    if not wl_checks_work_area.check_nlp_support(self.main, nlp_utils = ['pos_taggers']):
        return

    worker_colligation_extractor_table = Wl_Worker_Colligation_Extractor_Table(
        self.main,
        dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress_Process_Data(self.main),
        update_gui = self.update_gui_table
    )

    wl_threading.Wl_Thread(worker_colligation_extractor_table).start_worker()
def update_gui_table(self, err_msg, colligations_freqs_files, colligations_stats_files):
    """Rebuild the table from the results handed over by the table worker.

    Parameters:
        err_msg: Error text forwarded to ``check_results`` and, at the end,
            to ``check_err_table``; replaced by a traceback if filling the
            table raises.
        colligations_freqs_files: Mapping indexed by ``(node, collocate)``.
            Each value is iterated per file and then per span position, and
            its last entry is skipped when counting files (presumably the
            "Total" pseudo-file -- confirm against the worker).
        colligations_stats_files: Mapping passed to
            ``wl_sorting.sorted_stats_files_items``; each per-file entry
            unpacks to ``(test_stat, p_val, bayes_factor, effect_size)``.
    """
    if wl_checks_work_area.check_results(self.main, err_msg, colligations_freqs_files):
        try:
            # Snapshot of the settings the table was generated with
            self.settings = copy.deepcopy(self.main.settings_custom)

            self.clr_table()

            settings = self.main.settings_custom['colligation_extractor']

            test_statistical_significance = settings['generation_settings']['test_statistical_significance']
            measure_bayes_factor = settings['generation_settings']['measure_bayes_factor']
            measure_effect_size = settings['generation_settings']['measure_effect_size']

            col_text_test_stat = self.main.settings_global['tests_statistical_significance'][test_statistical_significance]['col_text']
            col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text']

            # Insert columns
            files = list(self.main.wl_file_area.get_selected_files())
            # A pseudo-file named "Total" gets the same set of columns as real files
            files_with_total = files + [{'name': self.tr('Total')}]

            for file in files_with_total:
                if file['name'] == self.tr('Total'):
                    is_breakdown_file = False
                else:
                    is_breakdown_file = True

                # One count column and one % column per span position; position 0
                # matches neither branch below and therefore gets no column.
                # NOTE(review): columnCount() - 2 presumably inserts in front of the two
                # trailing "Number of Files Found" columns -- confirm in ins_header_hor
                for i in range(
                    settings['generation_settings']['window_left'],
                    settings['generation_settings']['window_right'] + 1
                ):
                    if i < 0:
                        self.ins_header_hor(
                            self.model().columnCount() - 2,
                            self.tr('[{}]\nL{}').format(file['name'], -i),
                            is_int = True, is_cum = True,
                            is_breakdown_file = is_breakdown_file, is_breakdown_span_position = True
                        )
                        self.ins_header_hor(
                            self.model().columnCount() - 2,
                            self.tr('[{}]\nL{} %').format(file['name'], -i),
                            is_pct = True, is_cum = True,
                            is_breakdown_file = is_breakdown_file, is_breakdown_span_position = True
                        )
                    elif i > 0:
                        self.ins_header_hor(
                            self.model().columnCount() - 2,
                            self.tr('[{}]\nR{}').format(file['name'], i),
                            is_int = True, is_cum = True,
                            is_breakdown_file = is_breakdown_file, is_breakdown_span_position = True
                        )
                        self.ins_header_hor(
                            self.model().columnCount() - 2,
                            self.tr('[{}]\nR{} %').format(file['name'], i),
                            is_pct = True, is_cum = True,
                            is_breakdown_file = is_breakdown_file, is_breakdown_span_position = True
                        )

                self.ins_header_hor(
                    self.model().columnCount() - 2,
                    self.tr('[{}]\nFrequency').format(file['name']),
                    is_int = True, is_cum = True,
                    is_breakdown_file = is_breakdown_file
                )
                self.ins_header_hor(
                    self.model().columnCount() - 2,
                    self.tr('[{}]\nFrequency %').format(file['name']),
                    is_pct = True, is_cum = True,
                    is_breakdown_file = is_breakdown_file
                )

                if test_statistical_significance != 'none':
                    # The test statistic column is only added when the test has a column text
                    if col_text_test_stat:
                        self.ins_header_hor(
                            self.model().columnCount() - 2,
                            f'[{file["name"]}]\n{col_text_test_stat}',
                            is_float = True,
                            is_breakdown_file = is_breakdown_file
                        )

                    self.ins_header_hor(
                        self.model().columnCount() - 2,
                        self.tr('[{}]\np-value').format(file['name']),
                        is_float = True,
                        is_breakdown_file = is_breakdown_file
                    )

                if measure_bayes_factor != 'none':
                    self.ins_header_hor(
                        self.model().columnCount() - 2,
                        self.tr('[{}]\nBayes Factor').format(file['name']),
                        is_float = True,
                        is_breakdown_file = is_breakdown_file
                    )

                if measure_effect_size != 'none':
                    self.ins_header_hor(
                        self.model().columnCount() - 2,
                        f'[{file["name"]}]\n{col_text_effect_size}',
                        is_float = True,
                        is_breakdown_file = is_breakdown_file
                    )

            # Sort by p-value of the first file
            if test_statistical_significance != 'none':
                self.horizontalHeader().setSortIndicator(
                    self.find_header_hor(self.tr('[{}]\np-value').format(files[0]['name'])),
                    Qt.AscendingOrder
                )
            # Sort by bayes factor of the first file
            elif measure_bayes_factor != 'none':
                self.horizontalHeader().setSortIndicator(
                    self.find_header_hor(self.tr('[{}]\nBayes Factor').format(files[0]['name'])),
                    Qt.DescendingOrder
                )
            # Sort by effect size of the first file
            elif measure_effect_size != 'none':
                self.horizontalHeader().setSortIndicator(
                    self.find_header_hor(f"[{files[0]['name']}]\n{col_text_effect_size}"),
                    Qt.DescendingOrder
                )
            # Otherwise sort by frequency of the first file
            else:
                self.horizontalHeader().setSortIndicator(
                    self.find_header_hor(self.tr('[{}]\nFrequency').format(files[0]['name'])),
                    Qt.DescendingOrder
                )

            # Column of the first span position of every file (and of "Total");
            # the remaining positions are addressed relative to it below
            if settings['generation_settings']['window_left'] < 0:
                cols_freqs_start = [
                    self.find_header_hor(self.tr('[{}]\nL{}').format(file['name'], -settings['generation_settings']['window_left']))
                    for file in files_with_total
                ]
            else:
                cols_freqs_start = [
                    self.find_header_hor(self.tr('[{}]\nR{}').format(file['name'], settings['generation_settings']['window_left']))
                    for file in files_with_total
                ]

            cols_freq = self.find_headers_hor(self.tr('\nFrequency'))
            cols_freq_pct = self.find_headers_hor(self.tr('\nFrequency %'))

            # The percentage columns are dropped from the plain frequency
            # columns, leaving the two lists disjoint
            for col in cols_freq_pct:
                cols_freq.remove(col)

            cols_test_stat = self.find_headers_hor(f'\n{col_text_test_stat}')
            cols_p_val = self.find_headers_hor(self.tr('\np-value'))
            cols_bayes_factor = self.find_headers_hor(self.tr('\nBayes Factor'))
            cols_effect_size = self.find_headers_hor(f'\n{col_text_effect_size}')
            col_files_found = self.find_header_hor(self.tr('Number of\nFiles Found'))
            col_files_found_pct = self.find_header_hor(self.tr('Number of\nFiles Found %'))

            # Totals over all colligations: indexed [file][span position] ...
            freqs_totals = numpy.array(list(colligations_freqs_files.values())).sum(axis = 0)
            # ... and, summed over span positions as well, indexed [file]
            freq_totals = numpy.array(list(colligations_freqs_files.values())).sum(axis = 2).sum(axis = 0)
            len_files = len(files)

            self.model().setRowCount(len(colligations_freqs_files))

            self.disable_updates()

            for i, ((node, collocate), stats_files) in enumerate(wl_sorting.sorted_stats_files_items(colligations_stats_files)):
                freqs_files = colligations_freqs_files[(node, collocate)]

                # Rank
                # -1 is written here; update_ranks() is called once all rows are filled
                self.set_item_num(i, 0, -1)

                # Node
                self.model().setItem(i, 1, wl_tables.Wl_Table_Item(' '.join(wl_texts.to_display_texts(node))))
                self.model().item(i, 1).tokens_filter = node

                # Collocate
                self.model().setItem(i, 2, wl_tables.Wl_Table_Item(collocate.display_text()))
                self.model().item(i, 2).tokens_filter = [collocate]

                # Frequency
                for j, freqs_file in enumerate(freqs_files):
                    for k, freq in enumerate(freqs_file):
                        # Every span position occupies two adjacent columns: count, then %
                        self.set_item_num(i, cols_freqs_start[j] + k * 2, freq)

                        # Guard against division by zero when the position has no hits at all
                        if freqs_totals[j][k]:
                            self.set_item_num(i, cols_freqs_start[j] + k * 2 + 1, freq / freqs_totals[j][k])
                        else:
                            self.set_item_num(i, cols_freqs_start[j] + k * 2 + 1, 0)

                    self.set_item_num(i, cols_freq[j], sum(freqs_file))

                    if freq_totals[j]:
                        self.set_item_num(i, cols_freq_pct[j], sum(freqs_file) / freq_totals[j])
                    else:
                        self.set_item_num(i, cols_freq_pct[j], 0)

                for j, (test_stat, p_val, bayes_factor, effect_size) in enumerate(stats_files):
                    # Test Statistic
                    if test_stat is not None:
                        self.set_item_num(i, cols_test_stat[j], test_stat)

                    # p-value
                    if p_val is not None:
                        self.set_item_p_val(i, cols_p_val[j], p_val)

                    # Bayes Factor
                    if bayes_factor is not None:
                        self.set_item_num(i, cols_bayes_factor[j], bayes_factor)

                    # Effect Size
                    if effect_size is not None:
                        self.set_item_num(i, cols_effect_size[j], effect_size)

                # Number of Files Found
                # The last entry is excluded from the count (presumably the "Total" entry)
                num_files_found = len([freqs_file for freqs_file in freqs_files[:-1] if sum(freqs_file)])

                self.set_item_num(i, col_files_found, num_files_found)
                self.set_item_num(i, col_files_found_pct, num_files_found / len_files)

            self.enable_updates()

            self.toggle_pct_data_span_position()
            self.toggle_cum_data()
            self.toggle_breakdown_span_position()
            self.toggle_breakdown_file_span_position()

            self.update_ranks()
        except Exception:
            # Hand the traceback to the error check below instead of raising
            err_msg = traceback.format_exc()
        finally:
            wl_checks_work_area.check_err_table(self.main, err_msg)
@wl_misc.log_time
def generate_fig(self):
    """Start the figure worker once the search terms and POS tagger support check out."""
    search_settings = self.main.settings_custom['colligation_extractor']['search_settings']

    # The NLP support check is only reached when the search term check passes
    if not wl_checks_work_area.check_search_terms(self.main, search_settings = search_settings):
        return

    if not wl_checks_work_area.check_nlp_support(self.main, nlp_utils = ['pos_taggers']):
        return

    # Keep a reference to the worker: update_gui_fig closes its progress dialog later
    self.worker_colligation_extractor_fig = Wl_Worker_Colligation_Extractor_Fig(
        self.main,
        dialog_progress = wl_dialogs_misc.Wl_Dialog_Progress_Process_Data(self.main),
        update_gui = self.update_gui_fig
    )

    thread_fig = wl_threading.Wl_Thread(self.worker_colligation_extractor_fig)
    thread_fig.start_worker()
def update_gui_fig(self, err_msg, colligations_freqs_file, colligations_stats_files):
    """Plot the data selected in the figure settings from the worker's results.

    Parameters:
        err_msg: Error text passed on by the worker; it is handed to
            check_results() first and, together with any error raised here,
            to check_err_fig() at the end.
        colligations_freqs_file: Maps each colligation to its frequencies,
            indexed here as [file, span position].
        colligations_stats_files: Maps each colligation to its statistics,
            indexed here as [file, statistic] with the statistics in the order
            test statistic, p-value, Bayes factor, effect size.
    """
    if wl_checks_work_area.check_results(self.main, err_msg, colligations_freqs_file):
        try:
            settings = self.main.settings_custom['colligation_extractor']
            settings_generation = settings['generation_settings']

            test_statistical_significance = settings_generation['test_statistical_significance']
            measure_effect_size = settings_generation['measure_effect_size']

            col_text_test_stat = self.main.settings_global['tests_statistical_significance'][test_statistical_significance]['col_text']
            col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text']

            use_data = settings['fig_settings']['use_data']

            if re.search(self.tr(r'^[LR][0-9]+$'), use_data):
                window_left = settings_generation['window_left']
                window_right = settings_generation['window_right']

                # List the span positions in the same order as the worker stores
                # the per-position frequencies. Position 0 (the node itself) only
                # has to be skipped when the window straddles the node; a window
                # lying entirely on one side starts at window_left, not at 1.
                if window_left < 0 < window_right:
                    span_positions = list(range(window_left, 0)) + list(range(1, window_right + 1))
                else:
                    span_positions = list(range(window_left, window_right + 1))

                offset = int(use_data[1:])

                # Positions to the left of the node are negative
                if self.tr('L') in use_data:
                    offset = -offset

                # Raises ValueError (reported below) if the position is not in the window
                span_position = span_positions.index(offset)

                collocates_freq_files = {
                    colligation: numpy.array(freqs)[:, span_position]
                    for colligation, freqs in colligations_freqs_file.items()
                }

                wl_figs_freqs.wl_fig_freqs(
                    self.main, collocates_freq_files,
                    tab = 'colligation_extractor'
                )
            elif use_data == self.tr('Frequency'):
                # Total frequency over all span positions
                collocates_freq_files = {
                    colligation: numpy.array(freqs).sum(axis = 1)
                    for colligation, freqs in colligations_freqs_file.items()
                }

                wl_figs_freqs.wl_fig_freqs(
                    self.main, collocates_freq_files,
                    tab = 'colligation_extractor'
                )
            else:
                # Column of each statistic within a statistics row; the first match wins
                stat_columns = (
                    (col_text_test_stat, 0),
                    (self.tr('p-value'), 1),
                    (self.tr('Bayes factor'), 2),
                    (col_text_effect_size, 3)
                )

                for col_text, i_stat in stat_columns:
                    if use_data == col_text:
                        break
                else:
                    raise ValueError(f'Unsupported figure data: {use_data}')

                collocates_stat_files = {
                    colligation: numpy.array(stats_files)[:, i_stat]
                    for colligation, stats_files in colligations_stats_files.items()
                }

                wl_figs_stats.wl_fig_stats(
                    self.main, collocates_stat_files,
                    tab = 'colligation_extractor'
                )

            # Hide the progress dialog early so that the main window will not obscure the generated figure
            self.worker_colligation_extractor_fig.dialog_progress.accept()
            wl_figs.show_fig()
        except Exception:
            err_msg = traceback.format_exc()
        finally:
            wl_checks_work_area.check_err_fig(self.main, err_msg)
# self.tr() does not work in inherited classes
class Wl_Worker_Colligation_Extractor(wl_threading.Wl_Worker):
    """Worker that counts (node, collocate) colligations and computes their statistics.

    Collocates are taken from text.tags inside the configured collocational
    window around every n-gram. After run(), colligations_freqs_files and
    colligations_stats_files each hold one dict per selected file followed by
    one dict for the total; with a single file, that file's dicts are repeated
    as the total. Any exception raised in run() is stored in err_msg as a
    formatted traceback instead of being propagated.
    """

    worker_done = pyqtSignal(str, dict, dict)

    def __init__(self, main, dialog_progress, update_gui):
        super().__init__(main, dialog_progress, update_gui)

        self.err_msg = ''
        self.colligations_freqs_files = []
        self.colligations_stats_files = []

    @staticmethod
    def _tags_in_span(tags, start, stop, unit_span):
        """Return tags[start:stop], restricted to unit_span when one is given.

        unit_span is either None (no restriction) or an inclusive
        (first, last) pair of token positions.
        """
        if unit_span is None:
            return tags[start:stop]

        unit_start, unit_end = unit_span

        return [
            tags[position]
            for position in range(start, stop)
            if unit_start <= position <= unit_end
        ]

    def run(self):
        """Fill colligations_freqs_files and colligations_stats_files from the selected files."""
        try:
            colligations_freqs_files_all = []

            settings = self.main.settings_custom['colligation_extractor']
            settings_generation = settings['generation_settings']
            context_settings = settings['search_settings']['context_settings']
            files = list(self.main.wl_file_area.get_selected_files())

            window_left = settings_generation['window_left']
            window_right = settings_generation['window_right']

            # Calculate window size
            if window_left < 0 < window_right:
                window_size = window_right - window_left
            else:
                window_size = window_right - window_left + 1

            # Limit Searching: look the option labels up once instead of once per n-gram
            settings_limit_searching = settings_generation['limit_searching']
            limit_none = _tr('Wl_Worker_Colligation_Extractor', 'None')
            limit_sentence_segs = _tr('Wl_Worker_Colligation_Extractor', 'Within sentence segments')
            limit_sentences = _tr('Wl_Worker_Colligation_Extractor', 'Within sentences')
            limit_paras = _tr('Wl_Worker_Colligation_Extractor', 'Within paragraphs')
            limit_searching = settings_limit_searching != limit_none

            # Frequency
            for file in files:
                colligations_freqs_file = {}
                colligations_freqs_file_all = {}

                text = wl_token_processing.wl_process_tokens_colligation_extractor(
                    self.main, file['text'],
                    token_settings = settings['token_settings'],
                    search_settings = settings['search_settings']
                )

                tokens = text.get_tokens_flat()
                (
                    offsets_paras,
                    offsets_sentences,
                    offsets_sentence_segs
                ) = text.get_offsets()

                search_terms = wl_matching.match_search_terms_ngrams(
                    self.main, tokens,
                    lang = text.lang,
                    token_settings = settings['token_settings'],
                    search_settings = settings['search_settings']
                )

                (
                    search_terms_incl,
                    search_terms_excl
                ) = wl_matching.match_search_terms_context(
                    self.main, tokens,
                    lang = text.lang,
                    token_settings = settings['token_settings'],
                    context_settings = context_settings
                )

                if search_terms:
                    len_search_term_min = min((len(search_term) for search_term in search_terms))
                    len_search_term_max = max((len(search_term) for search_term in search_terms))
                else:
                    len_search_term_min = 1
                    len_search_term_max = 1

                # The unit offsets depend only on the file, not on the n-gram
                if limit_searching:
                    if settings_limit_searching == limit_sentence_segs:
                        offsets_unit = offsets_sentence_segs
                    elif settings_limit_searching == limit_sentences:
                        offsets_unit = offsets_sentences
                    elif settings_limit_searching == limit_paras:
                        offsets_unit = offsets_paras
                    else:
                        raise ValueError(f'Unsupported limit searching setting: {settings_limit_searching}')

                    len_unit = len(offsets_unit)

                for ngram_size in range(len_search_term_min, len_search_term_max + 1):
                    colligations_freqs_all = collections.Counter()
                    colligations_freqs_file_all[ngram_size] = colligations_freqs_all

                    for i, ngram in enumerate(wl_nlp_utils.ngrams(tokens, ngram_size)):
                        # Limit Searching: first and last token position of the unit containing the n-gram's start
                        if limit_searching:
                            i_unit = bisect.bisect(offsets_unit, i) - 1

                            if i_unit < len_unit - 1:
                                i_unit_end = offsets_unit[i_unit + 1] - 1
                            else:
                                i_unit_end = text.num_tokens - 1

                            unit_span = (offsets_unit[i_unit], i_unit_end)
                        else:
                            unit_span = None

                        # Extract collocates as (collocate, index in the per-position frequency list),
                        # nearest-first on the left side, then left-to-right on the right side
                        collocates = []

                        if window_left < 0 < window_right:
                            tags_left = self._tags_in_span(
                                text.tags,
                                max(0, i + window_left), i,
                                unit_span
                            )
                            tags_right = self._tags_in_span(
                                text.tags,
                                i + ngram_size, i + ngram_size + window_right,
                                unit_span
                            )

                            collocates.extend(
                                (collocate, abs(window_left) - 1 - j)
                                for j, collocate in enumerate(reversed(tags_left))
                            )
                            collocates.extend(
                                (collocate, abs(window_left) + j)
                                for j, collocate in enumerate(tags_right)
                            )
                        elif window_left < 0 and window_right < 0:
                            tags_left = self._tags_in_span(
                                text.tags,
                                max(0, i + window_left), max(0, i + window_right + 1),
                                unit_span
                            )

                            collocates.extend(
                                (collocate, window_size - 1 - j)
                                for j, collocate in enumerate(reversed(tags_left))
                            )
                        elif window_left > 0 and window_right > 0:
                            tags_right = self._tags_in_span(
                                text.tags,
                                i + ngram_size + window_left - 1, i + ngram_size + window_right,
                                unit_span
                            )

                            collocates.extend(
                                (collocate, j)
                                for j, collocate in enumerate(tags_right)
                            )

                        # The context check receives the same arguments for every collocate of
                        # this n-gram, so it is evaluated once, and only if there is something to count
                        if collocates and wl_matching.check_context(
                            i, tokens,
                            context_settings = context_settings,
                            search_terms_incl = search_terms_incl,
                            search_terms_excl = search_terms_excl
                        ):
                            for collocate, i_span in collocates:
                                colligation = (ngram, collocate)

                                if colligation not in colligations_freqs_file:
                                    colligations_freqs_file[colligation] = [0] * window_size

                                colligations_freqs_file[colligation][i_span] += 1
                                colligations_freqs_all[colligation] += 1

                # Drop colligations with an empty token in the node or an empty collocate
                colligations_freqs_file = {
                    (ngram, collocate): freqs
                    for (ngram, collocate), freqs in colligations_freqs_file.items()
                    if all(ngram) and collocate
                }

                # Filter search terms: keep nodes that contain a search term
                colligations_freqs_file_filtered = {}

                for search_term in search_terms:
                    len_search_term = len(search_term)

                    for (node, collocate), freqs in colligations_freqs_file.items():
                        for ngram in wl_nlp_utils.ngrams(node, len_search_term):
                            if ngram == search_term:
                                colligations_freqs_file_filtered[(node, collocate)] = freqs

                self.colligations_freqs_files.append(colligations_freqs_file_filtered)

                # Frequency (All)
                colligations_freqs_files_all.append(colligations_freqs_file_all)

            # Total
            if len(files) > 1:
                colligations_freqs_total = {}
                colligations_freqs_total_all = {}

                # Frequency
                for colligations_freqs_file in self.colligations_freqs_files:
                    for colligation, freqs in colligations_freqs_file.items():
                        if colligation not in colligations_freqs_total:
                            colligations_freqs_total[colligation] = freqs
                        else:
                            colligations_freqs_total[colligation] = list(map(operator.add, colligations_freqs_total[colligation], freqs))

                # Frequency (All)
                for colligations_freqs_file_all in colligations_freqs_files_all:
                    for ngram_size, colligations_freqs in colligations_freqs_file_all.items():
                        if ngram_size not in colligations_freqs_total_all:
                            colligations_freqs_total_all[ngram_size] = collections.Counter()

                        colligations_freqs_total_all[ngram_size] += colligations_freqs

                self.colligations_freqs_files.append(colligations_freqs_total)
                colligations_freqs_files_all.append(colligations_freqs_total_all)

            test_statistical_significance = settings_generation['test_statistical_significance']
            measure_bayes_factor = settings_generation['measure_bayes_factor']
            measure_effect_size = settings_generation['measure_effect_size']

            func_statistical_significance = self.main.settings_global['tests_statistical_significance'][test_statistical_significance]['func']
            func_bayes_factor = self.main.settings_global['measures_bayes_factor'][measure_bayes_factor]['func']
            func_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['func']

            # Every file is scored against the colligations of the total (last entry)
            colligations_all = self.colligations_freqs_files[-1].keys()
            num_colligations_all = len(colligations_all)

            # Used for z-score (Berry-Rogghe)
            span = (abs(window_left) + abs(window_right)) / 2

            for colligations_freqs_file, colligations_freqs_file_all in zip(
                self.colligations_freqs_files,
                colligations_freqs_files_all
            ):
                if any((func_statistical_significance, func_bayes_factor, func_effect_size)):
                    colligations_stats_file = {}

                    # Marginal and grand totals per n-gram size. A file may lack an n-gram size
                    # that occurs in the total (its matched search terms can be shorter or longer
                    # than those of other files), so missing sizes must read as all-zero counts
                    # instead of failing on the lookup.
                    o1xs = collections.defaultdict(collections.Counter)
                    ox1s = collections.defaultdict(collections.Counter)
                    oxxs = collections.Counter()

                    for ngram_size, colligations_freqs in colligations_freqs_file_all.items():
                        for (node, collocate), freq in colligations_freqs.items():
                            o1xs[ngram_size][collocate] += freq
                            ox1s[ngram_size][node] += freq

                        oxxs[ngram_size] = sum(colligations_freqs.values())

                    # Cells of the 2x2 contingency table of every colligation
                    o11s = numpy.empty(shape = num_colligations_all, dtype = float)
                    o12s = numpy.empty(shape = num_colligations_all, dtype = float)
                    o21s = numpy.empty(shape = num_colligations_all, dtype = float)
                    o22s = numpy.empty(shape = num_colligations_all, dtype = float)

                    for i, (node, collocate) in enumerate(colligations_all):
                        len_node = len(node)

                        o11s[i] = sum(colligations_freqs_file.get((node, collocate), [0]))
                        o12s[i] = o1xs[len_node][collocate] - o11s[i]
                        o21s[i] = ox1s[len_node][node] - o11s[i]
                        o22s[i] = oxxs[len_node] - o11s[i] - o12s[i] - o21s[i]

                    # Test Statistic & p-value
                    if test_statistical_significance == 'none':
                        test_stats = [None] * num_colligations_all
                        p_vals = [None] * num_colligations_all
                    else:
                        if test_statistical_significance == 'z_score_berry_rogghe':
                            test_stats, p_vals = func_statistical_significance(self.main, o11s, o12s, o21s, o22s, span)
                        else:
                            test_stats, p_vals = func_statistical_significance(self.main, o11s, o12s, o21s, o22s)

                    # Bayes Factor
                    if measure_bayes_factor == 'none':
                        bayes_factors = [None] * num_colligations_all
                    else:
                        bayes_factors = func_bayes_factor(self.main, o11s, o12s, o21s, o22s)

                    # Effect Size
                    if measure_effect_size == 'none':
                        effect_sizes = [None] * num_colligations_all
                    else:
                        effect_sizes = func_effect_size(self.main, o11s, o12s, o21s, o22s)

                    for i, (node, collocate) in enumerate(colligations_all):
                        colligations_stats_file[(node, collocate)] = [
                            test_stats[i],
                            p_vals[i],
                            bayes_factors[i],
                            effect_sizes[i]
                        ]
                else:
                    colligations_stats_file = {
                        (node, collocate): [None] * 4
                        for node, collocate in colligations_all
                    }

                self.colligations_stats_files.append(colligations_stats_file)

            # With a single file, the file's results double as the total
            if len(files) == 1:
                self.colligations_freqs_files *= 2
                self.colligations_stats_files *= 2
        except Exception:
            self.err_msg = traceback.format_exc()
class Wl_Worker_Colligation_Extractor_Table(Wl_Worker_Colligation_Extractor):
    """Colligation worker that announces table rendering before emitting its results."""

    def run(self):
        # The base class fills err_msg and the frequency/statistics lists
        super().run()

        self.progress_updated.emit(self.tr('Rendering table...'))

        err_msg = self.err_msg
        freqs_files_merged = wl_misc.merge_dicts(self.colligations_freqs_files)
        stats_files_merged = wl_misc.merge_dicts(self.colligations_stats_files)

        self.worker_done.emit(err_msg, freqs_files_merged, stats_files_merged)
class Wl_Worker_Colligation_Extractor_Fig(Wl_Worker_Colligation_Extractor):
    """Colligation worker that announces figure rendering before emitting its results."""

    def run(self):
        # The base class fills err_msg and the frequency/statistics lists
        super().run()

        self.progress_updated.emit(self.tr('Rendering figure...'))

        err_msg = self.err_msg
        freqs_files_merged = wl_misc.merge_dicts(self.colligations_freqs_files)
        stats_files_merged = wl_misc.merge_dicts(self.colligations_stats_files)

        self.worker_done.emit(err_msg, freqs_files_merged, stats_files_merged)