reimandlab/Visualistion-Framework-for-Genome-Mutations

View on GitHub
website/views/filters.py

Summary

Maintainability
B
4 hrs
Test Coverage
from sqlalchemy import exists

from models import MC3Mutation, Disease, source_manager, SiteType, PCAWGMutation
from models.bio.drug import DrugGroup, Drug
from models import Cancer
from models import Mutation
from models import Site
from models import The1000GenomesMutation
from models import ExomeSequencingMutation
from models import ClinicalData
from database import has_or_any, db
from helpers.filters import Filter
from helpers.widgets import FilterWidget


class FiltersData:
    """State transfer object for AsyncFiltersHandler from filters.js.

    Every attribute set on the instance becomes a part of the
    dictionary returned by `to_json`.
    """

    def __init__(self, filter_manager):
        from flask import request

        short_query = filter_manager.url_string()
        long_query = filter_manager.url_string(expanded=True)

        # fall back to empty strings when the manager returns a falsy value
        self.query = short_query if short_query else ''
        self.expanded_query = long_query if long_query else ''
        # 'checksum' is read from the arguments of the current request
        self.checksum = request.args.get('checksum', '')
        # subclasses may replace this with rendered widgets
        self.dynamic_widgets = ''

    def to_json(self):
        """Return the instance attributes as a dictionary."""
        return vars(self)


class ProteinFiltersData(FiltersData):
    """FiltersData extended with rendered, dataset-specific widgets."""

    def __init__(self, filter_manager, protein):
        super().__init__(filter_manager)
        from flask import render_template

        # the widgets are created for the given protein
        # from the filters held by the filter manager
        protein_widgets = create_dataset_specific_widgets(
            protein, filter_manager.filters
        )
        rendered = render_template(
            'widgets/widget_list.html',
            widgets=protein_widgets,
            collapse=True
        )
        self.dynamic_widgets = rendered


def populations_labels(populations):
    """Create display labels for a mapping of field names to population names.

    Each label is the population name followed by (in parentheses)
    the upper-cased field name with its first four characters removed.
    """
    labels = []
    for field_name, population_name in populations.items():
        code = field_name[4:].upper()
        labels.append(''.join([population_name, ' (', code, ')']))
    return labels


class SourceDependentFilter(Filter):
    """Filter bound to a single mutation source.

    The mandatory `source` keyword argument is consumed by this class;
    all the other arguments are forwarded unchanged to `Filter`.
    """

    def __init__(self, *args, **kwargs):
        # pop, so that 'source' is not passed on to Filter;
        # a KeyError is raised if 'source' was not provided
        source = kwargs.pop('source')
        self.source = source
        super().__init__(*args, **kwargs)

    @property
    def visible(self):
        """Whether the current 'Mutation.sources' value equals the source of this filter."""
        selected_source = self.manager.get_value('Mutation.sources')
        return selected_source == self.source


class MutationDetailsFilter(SourceDependentFilter):
    """Mutation details filters restrict returned mutations to those
    which have at least one MutationDetails passing given criteria.

    Example mutation details attributes include:
        disease_name, cancer_type, population_name and so on

    As there may be hundreds of mutations per protein, to improve speed
    of filtering, the MutationDetails filters must be defined in a way
    which allows conversion to database-side SQL 'where' clauses.

    This means that 'as_sqlalchemy' is set to True by default and
    turning it off is strongly discouraged; still, one can overwrite
    as_sqlalchemy with a function defining a custom sqlalchemy filter.

    Also, the filters should be constructed in a way that the default
    value is equivalent to having the filter disabled. If this is not
    possible to achieve, one needs to provide 'skip_if_default=False'.
    """

    def __init__(
        self, target_details_class, attribute,
        nullable=False, as_sqlalchemy=True, skip_if_default=True,
        **kwargs
    ):
        # the filter targets Mutation together with the given details class
        super().__init__(
            [Mutation, target_details_class],
            attribute,
            nullable=nullable,
            as_sqlalchemy=as_sqlalchemy,
            skip_if_default=skip_if_default,
            **kwargs
        )


def sqlalchemy_filter_from_source_name(source_name):
    """Adapt mutation source filter to SQLAlchemy clause (for use in mutation query).

    For the 'user' source True is returned without any lookup.
    For any other source the name of the corresponding Mutation field
    is read from source_manager.visible_fields (KeyError if absent)
    and the result of has_or_any() for that field is returned.
    """
    if source_name != 'user':
        mutation_field = getattr(
            Mutation,
            source_manager.visible_fields[source_name]
        )
        return has_or_any(mutation_field)
    return True


class UserMutations:
    """Placeholder target for the filter of user-provided (custom) datasets.

    The class has no data nor behaviour of its own; in this module it is
    only passed as the target of the custom datasets 'sources' filter.
    """


def create_dataset_labels():
    """Return a dict mapping dataset names to their display names.

    The 'user' entry is always present and set to None.
    """
    # map dataset names to dataset display names
    labels = {}
    for dataset in source_manager.all:
        labels[dataset.name] = dataset.display_name

    # hide user's mutations in dataset choice
    # (there is a separate widget for that, shown only if there are any user's datasets)
    labels['user'] = None
    return labels


class HashableDict(dict):
    """A dict which can be hashed (e.g. used as a dict key or a set member).

    The hash is computed from the set of (key, value) pairs, so that two
    instances which compare equal always have equal hashes, regardless
    of the order in which their items were inserted (dict equality
    ignores the order, so the hash has to ignore it as well).

    All values need to be hashable, otherwise TypeError is raised when
    hashing. The hash reflects the content at the time of the call.
    """

    def __hash__(self):
        # frozenset (rather than tuple) makes the hash order-independent
        return hash(frozenset(self.items()))


class CachedQueries:
    """Results of database queries kept for reuse by filters and widgets.

    All the values are loaded on instantiation (see `reload`).
    """

    def __init__(self):
        self.reload()

    @staticmethod
    def _cancer_codes_with_mutations(mutation_model):
        """Return codes of cancers for which any `mutation_model` entry exists."""
        codes = []
        for cancer in Cancer.query:
            any_mutation = db.session.query(
                exists().where(mutation_model.cancer == cancer)
            ).scalar()
            if any_mutation:
                codes.append(cancer.code)
        return codes

    def reload(self):
        """Should be called after each cancer and public-dataset addition or change
        (It should not happen during normal service, only after migrations and during tests)
        """
        self.drug_groups = sorted(group.name for group in DrugGroup.query)

        # diseases are ordered by name, case-insensitively
        diseases_by_name = sorted(
            Disease.query,
            key=lambda disease: disease.name.lower()
        )
        self.all_disease_names_by_id = HashableDict(
            (disease.id, disease.name)
            for disease in diseases_by_name
        )

        self.all_cancer_codes_mc3 = self._cancer_codes_with_mutations(MC3Mutation)
        self.all_cancer_codes_pcawg = self._cancer_codes_with_mutations(PCAWGMutation)

        cancer_names = {}
        for cancer in Cancer.query:
            cancer_names[cancer.code] = f'{cancer.name} ({cancer.code})'
        self.all_cancer_names = cancer_names

        self.dataset_labels = create_dataset_labels()

        self.site_types = SiteType.query.all()


# Module-level instance shared by the functions below; note that creating it
# calls reload(), so the database is queried when this module is imported.
cached_queries = CachedQueries()


def common_filters(
    protein,
    default_source='MC3',
    source_nullable=False,
    custom_datasets_ids=()
):
    """Create the list of filters shared by the views.

    The list consists of the general filters (mutation source, custom
    dataset, PTM-only, drug group, site type) followed by the filters
    returned by `source_dependent_filters(protein)`.

    Args:
        protein: passed on to `source_dependent_filters`.
        default_source: default value of the 'sources' filter of Mutation.
        source_nullable: value of `nullable` for the same filter.
        custom_datasets_ids: iterable with the choices for the 'sources'
            filter of UserMutations; it is copied into a new list.
            The default is an immutable empty tuple (not a shared list).

    Returns:
        A new list of filters.
    """

    return [
        Filter(
            Mutation, 'sources', comparators=['in'],
            choices=list(source_manager.visible_fields.keys()),
            default=default_source, nullable=source_nullable,
            as_sqlalchemy=sqlalchemy_filter_from_source_name
        ),
        Filter(
            UserMutations, 'sources', comparators=['in'],
            choices=list(custom_datasets_ids),
            default=None, nullable=True
        ),
        Filter(
            Mutation, 'is_ptm', comparators=['eq']
        ),
        Filter(
            Drug, 'groups.name', comparators=['in'],
            nullable=False,
            choices=cached_queries.drug_groups,
            default=['approved'],
            multiple='all',
            as_sqlalchemy=True
        ),
        Filter(
            Site, 'types', comparators=['in'],
            choices={
                site_type.name: site_type
                for site_type in SiteType.available_types()
            },
            custom_comparators={'in': SiteType.fuzzy_comparator},
            as_sqlalchemy=SiteType.fuzzy_filter,
            as_sqlalchemy_joins=[Site.types]
        )
    ] + source_dependent_filters(protein)


def source_dependent_filters(protein=None):
    """Create the filters which are tied to a specific mutation source.

    If a (truthy) protein is given, the cancer codes and diseases are
    taken from that protein; otherwise the values for all the cancers
    and diseases kept in `cached_queries` are used.
    """

    if not protein:
        cancer_codes_mc3 = cached_queries.all_cancer_codes_mc3
        cancer_codes_pcawg = cached_queries.all_cancer_codes_pcawg
        disease_names_by_id = cached_queries.all_disease_names_by_id
    else:
        cancer_codes_mc3 = protein.cancer_codes(MC3Mutation)
        cancer_codes_pcawg = protein.cancer_codes(PCAWGMutation)
        disease_names_by_id = protein.disease_names_by_id

    # the dict views are cast to lists
    populations_1kg = list(The1000GenomesMutation.populations.values())
    populations_esp = list(ExomeSequencingMutation.populations.values())
    significances = list(ClinicalData.significance_codes.keys())

    # only those significance classes are selected by default
    default_significances = {'Pathogenic', 'Pathogenic/Likely pathogenic', 'Likely pathogenic'}
    default_significance_codes = [
        code
        for code, significance in ClinicalData.significance_codes.items()
        if significance in default_significances
    ]

    # star counts in descending order, followed by -1
    gold_stars = sorted(ClinicalData.stars_by_status.values(), reverse=True) + [-1]

    mc3_cancer_filter = MutationDetailsFilter(
        MC3Mutation, 'mc3_cancer_code',
        comparators=['in'],
        choices=cached_queries.all_cancer_codes_mc3,
        default=cancer_codes_mc3,
        source='MC3',
        multiple='any',
    )
    pcawg_cancer_filter = MutationDetailsFilter(
        PCAWGMutation, 'pcawg_cancer_code',
        comparators=['in'],
        choices=cached_queries.all_cancer_codes_pcawg,
        default=cancer_codes_pcawg,
        source='PCAWG',
        multiple='any',
    )
    populations_1kg_filter = MutationDetailsFilter(
        The1000GenomesMutation, 'populations_1KG',
        comparators=['in'],
        choices=populations_1kg,
        default=populations_1kg,
        source='1KGenomes',
        multiple='any',
    )
    populations_esp_filter = MutationDetailsFilter(
        ExomeSequencingMutation, 'populations_ESP6500',
        comparators=['in'],
        choices=populations_esp,
        default=populations_esp,
        source='ESP6500',
        multiple='any'
    )
    # the default value of this filter does restrict the results,
    # hence the filter cannot be skipped when it has the default value
    significance_filter = MutationDetailsFilter(
        ClinicalData, 'sig_code',
        comparators=['in'],
        choices=significances,
        default=default_significance_codes,
        source='ClinVar',
        skip_if_default=False,
        multiple='any',
    )
    gold_stars_filter = MutationDetailsFilter(
        ClinicalData, 'gold_stars',
        comparators=['in'],
        choices=gold_stars,
        default=gold_stars,
        source='ClinVar',
        multiple='any',
    )
    # We use disease id by default (instead of disease name)
    # because the names often include some characters or
    # symbols which make escaping and encoding tricky and
    # might cause the server to crash; also querying a lot of
    # diseases at a time caused the url queries to exceed
    # limits back then when the diseases were filtered by
    # names and not identifiers.
    disease_id_filter = MutationDetailsFilter(
        ClinicalData, 'disease_id',
        comparators=['in'],
        # casting to list() proved to be necessary
        # as we would otherwise get a wrong result of
        # the comparison: f.value != f.default
        # (due to the difference of types)
        choices=list(disease_names_by_id.keys()),
        default=list(disease_names_by_id.keys()),
        source='ClinVar',
        multiple='any',
    )
    # Disease_name is not visible from the user interface,
    # but was added to provide backward compatibility with
    # 1.0 version of ActiveDriverDB and enable creation of
    # API queries knowing only the disease names (not our
    # internal, volatile disease identifiers)
    disease_name_filter = MutationDetailsFilter(
        ClinicalData, 'disease_name',
        comparators=['in'],
        # casting to list(): see the note in disease_id
        choices=list(disease_names_by_id.values()),
        default=list(disease_names_by_id.values()),
        source='ClinVar',
        multiple='any',
    )

    return [
        mc3_cancer_filter,
        pcawg_cancer_filter,
        populations_1kg_filter,
        populations_esp_filter,
        significance_filter,
        gold_stars_filter,
        disease_id_filter,
        disease_name_filter,
    ]


def create_dataset_specific_widgets(protein, filters_by_id, population_widgets=True):
    """Create the widgets for filters specific to particular mutation datasets.

    If a (truthy) protein is given, the cancer choices and disease labels
    are taken from that protein; otherwise the cancer choices are empty
    and the names of all the diseases from `cached_queries` are used.

    `filters_by_id` is indexed with identifiers such as 'Mutation.sig_code'.
    The two 'Ethnicity' widgets are appended only if `population_widgets`
    is truthy.
    """
    if not protein:
        cancer_codes_mc3 = []
        cancer_codes_pcawg = []
        disease_names_by_id = cached_queries.all_disease_names_by_id
    else:
        cancer_codes_mc3 = protein.cancer_codes(MC3Mutation)
        cancer_codes_pcawg = protein.cancer_codes(PCAWGMutation)
        disease_names_by_id = protein.disease_names_by_id

    mc3_cancer_widget = FilterWidget(
        'Cancer type', 'checkbox_multiple',
        filter=filters_by_id['Mutation.mc3_cancer_code'],
        labels=cached_queries.all_cancer_names,
        choices=cancer_codes_mc3,
        all_selected_label='Any cancer type'
    )
    # NOTE(review): unlike the MC3 widget above, no labels are passed
    # here - confirm whether this is intended
    pcawg_cancer_widget = FilterWidget(
        'Cancer type', 'checkbox_multiple',
        filter=filters_by_id['Mutation.pcawg_cancer_code'],
        choices=cancer_codes_pcawg,
        all_selected_label='Any cancer type'
    )
    significance_widget = FilterWidget(
        'Clinical significance', 'checkbox_multiple',
        filter=filters_by_id['Mutation.sig_code'],
        all_selected_label='Any clinical significance class',
        labels=ClinicalData.significance_codes.values(),
        # do not collapse this filter by default so that the user
        # is aware that we only show a subset of mutations
        expanded=True
    )
    disease_widget = FilterWidget(
        'Disease', 'checkbox_multiple',
        filter=filters_by_id['Mutation.disease_id'],
        all_selected_label='Any disease',
        labels=disease_names_by_id
    )
    gold_stars_widget = FilterWidget(
        'Gold stars', 'checkbox_multiple',
        filter=filters_by_id['Mutation.gold_stars'],
        all_selected_label='Any number of stars',
        labels=['4 stars', '3 stars', '2 stars', '1 star', '0 stars', 'unknown'],
    )

    widgets = [
        mc3_cancer_widget,
        pcawg_cancer_widget,
        significance_widget,
        disease_widget,
        gold_stars_widget,
    ]

    if not population_widgets:
        return widgets

    widgets.append(
        FilterWidget(
            'Ethnicity', 'checkbox_multiple',
            filter=filters_by_id['Mutation.populations_1KG'],
            labels=populations_labels(The1000GenomesMutation.populations),
            all_selected_label='Any ethnicity'
        )
    )
    widgets.append(
        FilterWidget(
            'Ethnicity', 'checkbox_multiple',
            filter=filters_by_id['Mutation.populations_ESP6500'],
            labels=populations_labels(ExomeSequencingMutation.populations),
            all_selected_label='Any ethnicity'
        )
    )
    return widgets


def create_widgets(protein, filters_by_id, custom_datasets_names=None):
    """Widgets to be displayed on a bar above visualisation.

    Returns a dict with the keys: 'dataset', 'custom_dataset',
    'dataset_specific', 'is_ptm', 'ptm_type' and 'other'; the values
    of 'dataset_specific' and 'other' are lists of widgets, the
    remaining values are single widgets.
    """
    widgets = {}

    widgets['dataset'] = FilterWidget(
        'Mutation dataset', 'radio',
        filter=filters_by_id['Mutation.sources'],
        labels=cached_queries.dataset_labels,
        class_name='dataset-widget'
    )
    widgets['custom_dataset'] = FilterWidget(
        'Custom mutation dataset', 'radio',
        filter=filters_by_id['UserMutations.sources'],
        labels=custom_datasets_names
    )
    widgets['dataset_specific'] = create_dataset_specific_widgets(protein, filters_by_id)
    widgets['is_ptm'] = FilterWidget(
        'PTM mutations only', 'checkbox',
        filter=filters_by_id['Mutation.is_ptm'],
        disabled_label='all mutations',
        labels=['PTM mutations only']
    )

    site_types_filter = filters_by_id['Site.types']
    # only the site types having any sub-types are included in the hierarchy
    sub_types_by_type = {
        site_type.name: [sub_type.name for sub_type in site_type.sub_types]
        for site_type in cached_queries.site_types
        if site_type.sub_types
    }
    widgets['ptm_type'] = FilterWidget(
        'Type of PTM site', 'radio',
        filter=site_types_filter,
        disabled_label='Any site',
        hierarchy=sub_types_by_type
    )

    drug_group_widget = FilterWidget(
        'Drug group', 'checkbox_multiple',
        filter=filters_by_id['Drug.groups.name'],
        labels=[group.title() for group in cached_queries.drug_groups],
    )
    widgets['other'] = [drug_group_widget]

    return widgets