matteoferla/Python_SmallWorld_API

View on GitHub
smallworld_api/__init__.py

Summary

Maintainability
B
4 hrs
Test Coverage
"""
The primary class in the smallworld_api module is ``SmallWorld``.
All other classes, in other files, are inherited by this.
"""

from __future__ import annotations

__all__ = ['SmallWorld', 'NoMatchError']

from warnings import warn
import pandas as pd
from .defaults import Defaults  # class attributes
from .base import Base  # inherits Defaults
from .extras import Extras  # extra methods not required by search
from typing import *
from .nomatcherror import NoMatchError
from .search import Searcher
import time, sys

if TYPE_CHECKING or 'sphinx' in sys.modules:
    from rdkit import Chem


class SmallWorld(Searcher):
    """
    A python3 API based upon https://wiki.docking.org/index.php/How_to_use_SmallWorld_API

    This (``SmallWorld``) is the main class of the module ``smallworld_api`` and is split into separate files
    by functionality. The classes it bases are as follows:
    Defaults -> Common -> Base -> Extras -> Searcher -> SmallWorld

    For example, class attributes are in Defaults.
    """

    def __init__(self, update_dbs: bool = True):
        """
        Initialise the instance and, unless disabled, refresh the database list.

        :param update_dbs: when true and ``self._db_updated`` is falsy,
            ``self.retrieve_databases()`` is called.
        """
        super().__init__()
        if update_dbs and not self._db_updated:
            self.retrieve_databases()

    def search_smiles(self,
                      smiles: str,
                      db: str,
                      **other_parameters) -> pd.DataFrame:
        """
        Given a smiles and a database return the table of results!

        The optional arguments are:

        * dist (atom difference distance threshold; this method sends 0 when it is not given)
        * several in `.default_submission`...

        The number of results given are controlled by:

        * length = 10 (number of results)
        * draw = 10 (pointless atm)
        * start = 0

        Which are passed onto `.get_results`.

        Calls ``submit_query`` and then ``get_results`` (code in ``search.py``)
        Returns a pandas dataframe of results.
        The dataframe is not rdkit modified yet.

        The reply of ``submit_query`` is stored in ``.query_summary`` and its ``hlid`` value
        in ``.hit_list_id``. If ``get_results`` raises an ``Exception``, a warning is issued
        and it is retried once after 2 seconds; an error in the retry propagates.
        A warning (not an error) is issued if ``db`` is not in ``.db_choices``.
        """
        start = int(other_parameters.get('start', 0))
        dist = int(other_parameters.get('dist', 0))
        length = int(other_parameters.get('length', 10))
        draw = int(other_parameters.get('draw', 10))
        # only keys listed in ``valid_submit_keys`` are forwarded to the submission
        valids = {k: other_parameters[k] for k in self.valid_submit_keys if k in other_parameters}
        if db not in self.db_choices:
            warn(f'{db} is not a valid choice ({self.db_choices}).' +
                 'Check updated with `.retrieve_scorefun_options()`')
        # later entries win: ``dist`` overrides ``default_submission`` and ``valids`` overrides both
        params = {'smi': smiles,
                  'db': db,
                  **self.default_submission,
                  'dist': dist,
                  **valids}
        self.query_summary: Dict[str, Any] = self.submit_query(params)
        self.hit_list_id: int = self.query_summary['hlid']
        try:
            return self.get_results(start, length, draw)
        except Exception as error:
            # ``Exception`` rather than ``BaseException``: a KeyboardInterrupt or SystemExit
            # must stop the program, not trigger a sleep and a retry.
            warn(f'{error.__class__.__name__}: {error} was raised. ' +
                 'Retrying in 2 seconds. There may be connection issues')
            time.sleep(2)
            return self.get_results(start, length, draw)

    def search_mol(self,
                   mol: Chem.Mol,
                   db: str,
                   **other_parameters) -> pd.DataFrame:
        """
        Convert ``mol`` to SMILES with ``self.mol2smiles`` and pass it to ``.search_smiles``.

        :param mol: the query molecule.
        :param db: the database name, passed on unchanged.
        :param other_parameters: passed on unchanged to ``.search_smiles``.
        :return: whatever ``.search_smiles`` returns.
        """
        smiles = self.mol2smiles(mol)
        return self.search_smiles(smiles=smiles, db=db, **other_parameters)

    def search_many(self,
                    query: Union[Sequence[Any], Mapping[str, Any]],
                    db: str,
                    **other_parameters) -> pd.DataFrame:
        """
        search for many SMILES or Chem.Mol.

        :param query: a sequence (results are labelled by position), a mapping or a ``pd.Series``
            (results are labelled by key). A single string is treated as a one-item list.
        :param db: the database name, passed on to ``.search_smiles``.
        :param other_parameters: passed on to ``.search_smiles``.
            Three of them control which errors of a single query are downgraded to warnings,
            in this order of precedence:

            * ``tolerate_all_exceptions`` (truthy): any ``Exception``
            * ``tolerated_exceptions``: the given exception class or tuple of classes
            * ``tolerate_NoMatchError`` (truthy): ``NoMatchError`` only

            With none of them, the first error propagates.
        :return: the concatenated results with the added columns ``query_index`` and ``query_smiles``.
        :raises TypeError: if the query or one of its items is of an unsupported type.
        :raises NoMatchError: if no query yielded any result.
        """
        if isinstance(query, str):
            # a str is a Sequence too: without this its characters would be searched one by one
            query = [query]
        if isinstance(query, Sequence):  # list or tuple etc.
            iterator = enumerate(query)
        elif isinstance(query, Mapping):  # dict etc.
            iterator = query.items()
        elif isinstance(query, pd.Series):  # a pd.Series is not a sequence or a mapping apparently
            iterator = query.to_dict().items()
        else:
            raise TypeError(f'Unrecognised type: {type(query)} for `.search_many`')
        # ## what to capture (does not change between queries, so resolved once)
        if other_parameters.get('tolerate_all_exceptions'):
            # tested for truthiness: ``tolerate_all_exceptions=False`` must not tolerate everything
            tolerated_exceptions = (Exception,)
        elif 'tolerated_exceptions' in other_parameters:
            tolerated_exceptions = other_parameters['tolerated_exceptions']
        elif other_parameters.get('tolerate_NoMatchError'):
            tolerated_exceptions = (NoMatchError,)
        else:
            tolerated_exceptions = ()
        results: List[pd.DataFrame] = []
        last_call = 0  # timestamp of the previous submission; 0 means the first one never waits
        for name, item in iterator:
            # ## what is it?
            if isinstance(item, str):
                if not item.strip():
                    # an empty SMILES is skipped like any other empty value
                    warn(f'Falsy value {item} in the {type(query)} query')
                    continue
                # it's a smiles
                smiles = item
            elif self.is_this_mol(item):
                # rdkit
                smiles = self.mol2smiles(item)
            elif not item:
                # nothing
                warn(f'Falsy value {item} in the {type(query)} query')
                continue
            else:
                # mystery
                raise TypeError(f'Unrecognised type {type(item)}')
            # ## prevent excessive calls
            wait = self.speed_threshold - (time.time() - last_call)
            if wait > 0:
                time.sleep(wait)
            last_call = time.time()
            # ## run!
            try:
                self.reset()
                result: pd.DataFrame = self.search_smiles(smiles=smiles, db=db, **other_parameters)
                if result.empty:
                    continue
                result['query_index'] = name
                result['query_smiles'] = smiles
                results.append(result)
            except tolerated_exceptions as error:
                warn(f'{error.__class__.__name__}: {error} for {name}')
            # end of loop
        if not results:
            raise NoMatchError('No results were found in SmallWorld. ' +
                               'Considering changing `dist` (distance by N of mismatches) or '
                               '`length` (number of results) greater than zero')
        return pd.concat(results, axis='index', ignore_index=True)

    def search(self, query: Any, db: str, **other_parameters) -> pd.DataFrame:
        """
        The query can be

        * a single SMILES,
        * a rdkit.Chem.Mol
        * a list of SMILES or rdkit.Chem.Mol
        * a dictionary or pandas.Series of SMILES or rdkit.Chem.Mol

        An empty query raises a ``ValueError`` and an unsupported type a ``TypeError``.

        These all lead back to ``.search_smiles``, which functions as follows:
        """
        if isinstance(query, str):
            return self.search_smiles(smiles=query, db=db, **other_parameters)
        if self.is_this_mol(query):  # rdkit is optional.
            # the result must be returned, otherwise the caller receives None
            return self.search_mol(mol=query, db=db, **other_parameters)
        if isinstance(query, Sized) and len(query) == 0:
            raise ValueError('Empty query')
        # pd.Series is neither a Mapping nor a Sequence, but ``.search_many`` accepts it
        if isinstance(query, (Mapping, Sequence, pd.Series)):
            return self.search_many(query=query, db=db, **other_parameters)
        raise TypeError(f'Unknown type {type(query)} for query')


# Append the ``search_smiles`` docstring to that of ``search``.
# Skipped when docstrings are stripped (``python -OO``): both are then ``None``
# and ``+=`` would raise a TypeError at import time.
if SmallWorld.search.__doc__ and SmallWorld.search_smiles.__doc__:
    SmallWorld.search.__doc__ += SmallWorld.search_smiles.__doc__