embiggen/utils/abstract_models/abstract_embedding_model.py from monarch-initiative/N2V

embiggen/utils/abstract_models/abstract_embedding_model.py
Summary

Maintainability

1 hr
Test Coverage

Issues
"""Module providing abstract classes for embedding models."""
from typing import Dict, Any, Optional, Union
from ensmallen import Graph
from ensmallen.datasets import get_dataset
import warnings
from cache_decorator import Cache
from embiggen.utils.abstract_models.abstract_model import AbstractModel, abstract_class
from embiggen.utils.abstract_models.embedding_result import EmbeddingResult


@abstract_class
class AbstractEmbeddingModel(AbstractModel):
    """Class defining properties of an abstract embedding model."""

    def __init__(
        self,
        embedding_size: Optional[int] = None,
        enable_cache: bool = False,
        ring_bell: bool = False,
        random_state: Optional[int] = None
    ):
        """Create new embedding model.

        Parameters
        ---------------------
        embedding_size: Optional[int] = None
            The dimensionality of the embedding.
        enable_cache: bool = False
            Whether to enable the cache, that is to
            store the computed embedding.
        ring_bell: bool = False
            Whether to play a sound when embedding completes.
        random_state: Optional[int] = None
            The random state to use if the model is stocastic.
        """
        super().__init__(random_state=random_state)
        if embedding_size is not None and not isinstance(embedding_size, int) or embedding_size == 0:
            raise ValueError(
                "The embedding size, if provided, should be a strictly positive integer "
                f"but {embedding_size} was provided."
            )
        self._embedding_size = embedding_size
        self._enable_cache = enable_cache

        try:
            from ringbell import RingBell
            self._ring_bell = RingBell(
                verbose=ring_bell,
                sample="positive_notification"
            )
        except ModuleNotFoundError:
            self._ring_bell = None

    def parameters(self) -> Dict[str, Any]:
        """Returns parameters of the embedding model."""
        return dict(
            **super().parameters(),
            **(dict(embedding_size=self._embedding_size) if self._embedding_size is not None else dict())
        )

    @classmethod
    def requires_nodes_sorted_by_decreasing_node_degree(cls) -> bool:
        """Returns whether this embedding requires the node degrees to be sorted."""
        raise NotImplementedError((
            "The `requires_nodes_sorted_by_decreasing_node_degree` method must be implemented "
            "in the child classes of abstract model."
        ))

    @classmethod
    def get_minimum_required_number_of_node_types(cls) -> int:
        """Requires minimum number of required node types."""
        return 0

    def _fit_transform(
        self,
        graph: Graph,
        return_dataframe: bool = True,
    ) -> EmbeddingResult:
        """Run embedding on the provided graph.

        Parameters
        --------------------
        graph: Graph
            The graph to run predictions on.
        """
        raise NotImplementedError((
            "The `_fit_transform` method must be implemented "
            "in the child classes of abstract model."
        ))

    @Cache(
        cache_path="{cache_dir}/{self.model_name()}/{self.library_name()}/{graph.get_name()}/{_hash}.pkl.gz",
        cache_dir="embedding",
        enable_cache_arg_name="self._enable_cache",
    )
    def _cached_fit_transform(
        self,
        graph: Graph,
        return_dataframe: bool = True,
    ) -> EmbeddingResult:
        """Execute embedding on the provided graph.

        Parameters
        --------------------
        graph: Graph
            The graph to run embedding on.
        return_dataframe: bool = True
            Whether to return a pandas DataFrame with the embedding.

        Returns
        --------------------
        An embedding result, wrapping the complexity of a generic embedding.
        """
        if not graph.has_nodes():
            raise ValueError(
                f"The provided graph {graph.get_name()} is empty."
            )

        if self.requires_nodes_sorted_by_decreasing_node_degree():
            if not graph.has_nodes_sorted_by_decreasing_outbound_node_degree():
                raise ValueError(
                    f"The given graph {graph.get_name()} does not have the nodes sorted by decreasing "
                    "order, therefore the negative sampling (which follows a scale free "
                    "distribution) would not approximate well the Softmax.\n"
                    "In order to sort the given graph in such a way that the node IDs "
                    "are sorted by decreasing outbound node degrees, you can use "
                    "the Graph method `graph.sort_by_decreasing_outbound_node_degree()`."
                )

        if self.requires_node_types() and not graph.has_node_types():
            raise ValueError(
                f"The provided graph {graph.get_name()} does not have node types, but "
                f"the {self.model_name()} requires node types."
            )
        
        if self.requires_node_types() and graph.get_number_of_node_types() <= 1:
            raise ValueError(
                f"The {self.model_name()} requires the graph to have "
                f"at least {self.get_minimum_required_number_of_node_types()} node types, "
                f"but the provided one has {graph.get_number_of_node_types()} "
                "node types."
            )

        if self.requires_edge_types() and not graph.has_edge_types():
            raise ValueError(
                f"The provided graph {graph.get_name()} does not have edge types, but "
                f"the {self.model_name()} requires edge types."
            )

        if self.requires_edge_weights() and not graph.has_edge_weights():
            raise ValueError(
                f"The provided graph {graph.get_name()} does not have edge weights, but "
                f"the {self.model_name()} requires edge weights."
            )

        if self.requires_positive_edge_weights() and graph.has_edge_weights() and graph.has_negative_edge_weights():
            raise ValueError(
                f"The provided graph {graph.get_name()} has negative edge weights, but "
                f"the {self.model_name()} requires strictly positive edge weights."
            )

        if self.is_topological():
            if not graph.has_edges():
                raise ValueError(
                    f"The provided graph {graph.get_name()} does not have edges."
                )

            if graph.has_disconnected_nodes():
                warnings.warn(
                    (
                        f"Please be advised that the {graph.get_name()} graph "
                        f"contains {graph.get_number_of_disconnected_nodes()} disconnected nodes. "
                        "Consider that node embedding algorithms that only use topological information "
                        "such as CBOW, GloVe, SPINE and SkipGram are not able to provide meaningful "
                        "embeddings for these nodes, and their embedding will be generally "
                        "far away from any other node. It is also possible that all disconnected nodes "
                        "will receive a relatively similar node embedding. "
                        "Consider dropping them by using the `graph.remove_disconnected_nodes()` method."
                    )
                )

        result = self._fit_transform(
            graph=graph,
            return_dataframe=return_dataframe,
        )

        if not isinstance(result, EmbeddingResult):
            raise NotImplementedError(
                f"The embedding result produced by the {self.model_name()} method "
                f"from the library {self.library_name()} implemented in the class "
                f"called {self.__class__.__name__} does not return an Embeddingresult "
                f"but returns an object of type {type(result)}."
            )

        if self._ring_bell is not None:
            self._ring_bell.play()

        return result

    def fit_transform(
        self,
        graph: Union[Graph, str],
        repository: Optional[str] = None,
        version: Optional[str] = None,
        return_dataframe: bool = True,
    ) -> EmbeddingResult:
        """Execute embedding on the provided graph.

        Parameters
        --------------------
        graph: Graph
            The graph to run embedding on.
        repository: Optional[str] = None
            The repository from where to retrieve these graphs.
            This only applies for the graph names that are available
            from the ensmallen automatic retrieval.
        version: Optional[str] = None
            The version of the graphs to be retrieved.
            When this is left to none, the retrieved version will be
            the one that has been indicated to be the most recent one.
            This only applies for the graph names that are available
            from the ensmallen automatic retrieval.
        return_dataframe: bool = True
            Whether to return a pandas DataFrame with the embedding.

        Returns
        --------------------
        An embedding result, wrapping the complexity of a generic embedding.
        """
        if isinstance(graph, str):
            graph = get_dataset(
                name=graph,
                repository=repository,
                version=version
            )()
        if return_dataframe and graph.get_number_of_nodes() > 100_000_000:
            raise ValueError(
                (
                    "We cowardly refuse to execute this embedding with the "
                    "added requirement to also return the dataframe version "
                    "of this graph. This graph has {number_of_nodes}, and "
                    "creating a Dataframe would most likely cause an OOM on "
                    "your system."
                ).format(
                    number_of_nodes=graph.get_number_of_nodes()
                )
            )
        return self._cached_fit_transform(
            graph=graph,
            return_dataframe=return_dataframe,
        )
    
    @classmethod
    def can_use_edge_type_features(cls) -> bool:
        return False
    
    @classmethod
    def can_use_edge_features(cls) -> bool:
        return False