datanator_query_python/query/query_uniprot_org.py from KarrLab/datanator_query_python

datanator_query_python/query/query_uniprot_org.py
Summary

Maintainability

1 hr
Test Coverage

90%
Issues
Coverage
"""For querying uniprot.org using uniprot API
(https://www.uniprot.org/help/api_queries)
"""
import requests
from bs4 import BeautifulSoup
import re


class QueryUniprotOrg:
    """Query uniprot.org once and read fields out of the HTML result.

    The request is sent when the object is constructed; the parsed response
    is stored in ``self.soup`` and the ``get_*`` methods only inspect it.
    """

    # Columns requested from the API, as one comma-separated value.
    _COLUMNS = ('id,entry name,genes(PREFERRED),protein names,sequence,length,mass,ec,'
                'database(GeneID),reviewed,organism-id,database(KO),'
                'genes(ALTERNATIVE),genes(ORF),genes(OLN),'
                'database(EMBL),database(RefSeq),database(KEGG)')

    def __init__(self, query, api='https://www.uniprot.org/uniprot/?', include='yes', compress='no',
                 limit=1, offset=0):
        """Init

        Sends the query and parses the response body with ``html.parser``.

        Args:
            query (:obj:`str`): query message.
            api (:obj:`str`, optional): API url prefix; the query string is
                appended to it directly, so it should end with ``?``.
            include (:obj:`str`, optional): See description in link. Defaults to 'yes'.
            compress (:obj:`str`, optional): Return results gzipped. Defaults to 'no'.
            limit (:obj:`int`, optional): Max number of results to return. Defaults to 1.
            offset (:obj:`int`, optional): Offset of the first result. Defaults to 0.
        """
        url = self._build_url(api, query, include, compress, limit, offset)
        response = requests.get(url)
        self.soup = BeautifulSoup(response.content, 'html.parser')

    @staticmethod
    def _build_url(api, query, include, compress, limit, offset):
        """Assemble the full request URL.

        Args:
            api (:obj:`str`): API url prefix.
            query (:obj:`str`): query message, inserted verbatim.
            include (:obj:`str`): value of the ``include`` parameter.
            compress (:obj:`str`): value of the ``compress`` parameter.
            limit (:obj:`int`): value of the ``limit`` parameter.
            offset (:obj:`int`): value of the ``offset`` parameter.

        Return:
            (:obj:`str`): ``api`` followed by the query string.
        """
        # Every parameter must be separated by '&'; without the one before
        # 'format', the format value is glued onto the last column name.
        suffix = ('query={}&sort=score&columns={}&format={}&include={}'
                  '&compress={}&limit={}&offset={}').format(
                      query, QueryUniprotOrg._COLUMNS, 'html', include,
                      compress, limit, offset)
        return api + suffix

    def get_kegg_ortholog(self):
        """Get kegg ortholog information using query message.

        Return:
            (:obj:`str`): text of the first element whose ``href`` contains
            ``dbget-bin`` (the kegg ortholog number), or ``None`` if there
            is no such element.
        """
        links = self.soup.find_all(href=re.compile(".*dbget-bin.*"))
        if not links:
            return None
        return links[0].get_text()

    def get_uniprot_id(self):
        """Get uniprot id.

        Return:
            (:obj:`str`): second ``_``-separated piece of the ``id``
            attribute of the first ``basket-item namespace-uniprot``
            element, or ``None`` if there is no such element.
        """
        items = self.soup.find_all(class_='basket-item namespace-uniprot')
        if not items:
            return None
        return items[0]['id'].split('_')[1]

    def get_protein_name(self):
        """Get protein name.

        The first entry is the ``title`` of the first ``<div>`` inside the
        first ``protein_names`` element. The remaining entries come from the
        text of that div's next sibling, split on ``')  ('`` and then on
        commas, with surrounding whitespace removed.

        Return:
            (:obj:`list` of :obj:`str`): list of protein names, or ``None``
            if there is no ``protein_names`` element.
        """
        cells = self.soup.find_all(class_='protein_names')
        if not cells:
            return None
        primary = cells[0].div
        names = [primary['title']]
        alternatives = primary.next_sibling
        if alternatives is None:
            # No alternative-name node follows the primary name.
            return names
        for group in alternatives.get_text().strip().split(')  ('):
            names.extend(piece.strip() for piece in group.split(','))
        return names