website/imports/external_references.py from reimandlab/Visualistion-Framework-for-Genome-Mutations

website/imports/external_references.py
Summary

Maintainability

7 hrs
Test Coverage

Issues
from collections import defaultdict
import gzip

from database import db
from database import get_or_create
from helpers.parsers import parse_tsv_file
from helpers.parsers import iterate_tsv_gz_file
from models import Protein
from models import ProteinReferences
from models import EnsemblPeptide
from models import UniprotEntry

from sqlalchemy.orm.exc import NoResultFound


class ReferencesParser:
    def __init__(self):
        self.references = defaultdict(list)

    ensembl_references_to_collect = {
        'Ensembl_PRO': 'peptide_id'
    }

    def add_uniprot_accession(self, data):
        # full uniprot includes isoform (if relevant)
        full_uniprot, ref_type, value = data

        if ref_type == 'RefSeq_NT':
            # get protein
            refseq_nm = value.split('.')[0]

            if not refseq_nm or not refseq_nm.startswith('NM') or not full_uniprot:
                return

            try:
                protein = Protein.query.filter_by(refseq=refseq_nm).one()
            except NoResultFound:
                return

            try:
                uniprot, isoform = full_uniprot.split('-')
                isoform = int(isoform)
            except ValueError:
                # only one isoform ?
                # print('No isoform specified for', full_uniprot, refseq_nm)
                uniprot = full_uniprot
                isoform = None  # indicates "no isoform specified"

            reference, new = get_or_create(ProteinReferences, protein=protein)
            uniprot_entry, new_uniprot = get_or_create(UniprotEntry, accession=uniprot, isoform=isoform)
            reference.uniprot_entries.append(uniprot_entry)
            self.references[uniprot].append(reference)

            if new:
                db.session.add(reference)
            if new_uniprot:
                db.session.add(uniprot_entry)

    def add_references_by_uniprot(self, data):

        full_uniprot, ref_type, value = data

        if '-' in full_uniprot:
            uniprot, isoform = full_uniprot.split('-')
            uniprot_tied_references = self.references.get(uniprot, None)
            if not uniprot_tied_references:
                return

            relevant_references = []
            # select relevant references:
            for reference in uniprot_tied_references:
                if any(entry.isoform == int(isoform) for entry in reference.uniprot_entries):
                    relevant_references.append(reference)

        else:
            uniprot_tied_references = self.references.get(full_uniprot, None)
            if not uniprot_tied_references:
                return
            relevant_references = uniprot_tied_references

        if ref_type == 'UniProtKB-ID':
            # http://www.uniprot.org/help/entry_name
            # "Each >reviewed< entry is assigned a unique entry name upon integration into UniProtKB/Swiss-Prot"
            # Entry names comes in format: X_Y;
            # - for Swiss-Prot entry X is a mnemonic protein identification code (at most 5 characters)
            # - for TrEMBL entry X is the same as accession code (6 to 10 characters)
            x, y = value.split('_')

            if len(x) <= 5:
                for reference in relevant_references:
                    assert '-' not in full_uniprot
                    matching_entries = [entry for entry in reference.uniprot_entries if entry.accession == full_uniprot]
                    if len(matching_entries) != 1:
                        print(f'More than one match for reference: {reference}: {matching_entries}')
                    if not matching_entries:
                        print(f'No matching entries for reference: {reference}: {matching_entries}')
                        continue
                    entry = matching_entries[0]
                    entry.reviewed = True

            return

        if ref_type in self.ensembl_references_to_collect:

            attr = self.ensembl_references_to_collect[ref_type]

            for relevant_reference in relevant_references:
                attrs = {'reference': relevant_reference, attr: value}

                peptide, new = get_or_create(EnsemblPeptide, **attrs)

                if new:
                    db.session.add(peptide)

    def add_ncbi_mappings(self, data):
        # 9606    3329    HSPD1   NG_008915.1     NM_199440.1     NP_955472.1     reference standard
        taxonomy, entrez_id, gene_name, refseq_gene, lrg, refseq_nucleotide, t, refseq_peptide, p, category = data

        refseq_nm = refseq_nucleotide.split('.')[0]

        if not refseq_nm or not refseq_nm.startswith('NM'):
            return

        try:
            protein = Protein.query.filter_by(refseq=refseq_nm).one()
        except NoResultFound:
            return

        reference, new = get_or_create(ProteinReferences, protein=protein)

        if new:
            db.session.add(reference)

        reference.refseq_np = refseq_peptide.split('.')[0]
        reference.refseq_ng = refseq_gene.split('.')[0]
        gene = protein.gene

        if gene.name != gene_name:
            print(f'Gene name mismatch for RefSeq mappings: {gene.name} vs {gene_name}')

        entrez_id = int(entrez_id)

        if gene.entrez_id:
            if gene.entrez_id != entrez_id:
                print(f'Entrez ID mismatch for isoforms of {gene.name} gene: {gene.entrez_id}, {entrez_id}')
                if gene.name == gene_name:
                    print(
                        f'Overwriting {gene.entrez_id} entrez id with {entrez_id} for {gene.name} gene, '
                        f'because record with {entrez_id} has matching gene name'
                    )
                    gene.entrez_id = entrez_id
        else:
            gene.entrez_id = entrez_id

    def parse(
        self,
        path='data/HUMAN_9606_idmapping.dat.gz',
        refseq_lrg='data/LRG_RefSeqGene',
        refseq_link='data/refseq_link.tsv.gz'
    ):
        parse_tsv_file(refseq_lrg, self.add_ncbi_mappings, file_header=[
            '#tax_id', 'GeneID', 'Symbol', 'RSG', 'LRG', 'RNA', 't', 'Protein', 'p', 'Category'
        ])

        # add mappings retrieved from UCSC tables for completeness
        header = ['#name', 'product', 'mrnaAcc', 'protAcc', 'geneName', 'prodName', 'locusLinkId', 'omimId']
        for line in iterate_tsv_gz_file(refseq_link, header):
            gene_name, protein_full_name, refseq_nm, refseq_peptide, _, _, entrez_id, omim_id = line

            if not refseq_nm or not refseq_nm.startswith('NM'):
                continue

            try:
                protein = Protein.query.filter_by(refseq=refseq_nm).one()
            except NoResultFound:
                continue

            gene = protein.gene

            if gene.name != gene_name:
                print(f'Gene name mismatch for RefSeq mappings: {gene.name} vs {gene_name}')

            entrez_id = int(entrez_id)

            if protein_full_name:
                if protein.full_name:
                    if protein.full_name != protein_full_name:
                        print(
                            f'Protein full name mismatch: {protein.full_name} vs {protein_full_name} for {protein.refseq}'
                        )
                    continue
                protein.full_name = protein_full_name

            if gene.entrez_id:
                if gene.entrez_id != entrez_id:
                    print(f'Entrez ID mismatch for isoforms of {gene.name} gene: {gene.entrez_id}, {entrez_id}')
                    if gene.name == gene_name:
                        print(
                            f'Overwriting {gene.entrez_id} entrez id with {entrez_id} for {gene.name} gene, '
                            f'because record with {entrez_id} has matching gene name'
                        )
                        gene.entrez_id = entrez_id
            else:
                gene.entrez_id = entrez_id

            if refseq_peptide:
                reference, new = get_or_create(ProteinReferences, protein=protein)

                if new:
                    db.session.add(reference)

                if reference.refseq_np and reference.refseq_np != refseq_peptide:
                    print(
                        f'Refseq peptide mismatch between LRG and UCSC retrieved data: '
                        f'{reference.refseq_np} vs {refseq_peptide} for {protein.refseq}'
                    )

                reference.refseq_np = refseq_peptide

        parse_tsv_file(path, self.add_uniprot_accession, file_opener=gzip.open, mode='rt')
        parse_tsv_file(path, self.add_references_by_uniprot, file_opener=gzip.open, mode='rt')

        return [reference for reference_group in self.references.values() for reference in reference_group]