w4k2/stream-learn

View on GitHub
strlearn/ensembles/REA.py

Summary

Maintainability
A
2 hrs
Test Coverage
import numpy as np
from sklearn.base import clone
from ..ensembles.base import StreamingEnsemble
from sklearn.neighbors import NearestNeighbors


class REA(StreamingEnsemble):
    """
    Recursive Ensemble Approach.

    Sheng Chen, and Haibo He. "Towards incremental learning of nonstationary imbalanced data stream: a multiple selectively recursive approach." Evolving Systems 2.1 (2011): 35-50.
    """

    def __init__(self,
                 base_estimator=None,
                 n_estimators=10,
                 post_balance_ratio=0.5,
                 k_parameter=10,
                 weighted=False,
                 pruning=False):
        super().__init__(base_estimator, n_estimators, weighted=weighted)
        self.post_balance_ratio = post_balance_ratio
        self.k_parameter = k_parameter
        self.pruning = pruning

    def partial_fit(self, X, y, classes=None):
        super().partial_fit(X, y, classes)
        if not self.green_light:
            return self

        # Find minority and majority names
        if not hasattr(self, "minority_name") or not hasattr(self, "majority_name"):
            self.minority_name, self.majority_name = self.minority_majority_name(y)

        # Resample data
        res_X, res_y = self._resample(X, y)

        # Append new estimator
        self.ensemble_.append(clone(self.base_estimator).fit(res_X, res_y))

        # # Remove the worst model when ensemble becomes too large
        if self.pruning:
            if len(self.ensemble_) >= self.n_estimators:
                worst = np.argmin(self.weights_)
                del self.ensemble_[worst]
                np.delete(self.weights_, worst)

        # Recalculate the ensemble weights
        weights = []
        for clf in self.ensemble_:
            y_pred = clf.predict(X).astype(int)
            probas_ = clf.predict_proba(X)[np.arange(len(X)), y_pred]
            weights.append(np.log(1/(np.sum((1-probas_)**2)/len(X))))
        self.weights_ = np.array(weights)

        return self

    def _resample(self, X, y):
        # Split the data
        minority, majority = self.minority_majority_split(X, y, self.minority_name, self.majority_name)

        # Check if the minority data has been previously accumulated
        if not hasattr(self, "minority_data_"):
            self.minority_data_ = minority
            return X, y

        # Calculate actual imbalanced ratio
        imbalanced_ratio = len(minority[:, 0])/float(len(X[:, 0]))

        # Resample data
        if self.post_balance_ratio > imbalanced_ratio:

            knn = NearestNeighbors(n_neighbors=self.k_parameter).fit(X)

            indices = knn.kneighbors(self.minority_data_, return_distance=False)

            min_count = np.count_nonzero(y[indices] == self.minority_name, axis=1)

            a = np.arange(0, len(min_count))
            min_count = np.insert(np.expand_dims(min_count, axis=1), 1, a, axis=1)
            min_count = min_count[min_count[:, 0].argsort()]
            min_count = min_count[::-1]

            sorted_minority = min_count[:, 1].astype("int")

            n_instances = int((self.post_balance_ratio - imbalanced_ratio)*len(y))
            n_instances = min(n_instances, len(sorted_minority))
            new_minority = self.minority_data_[sorted_minority[0:n_instances]]

            res_X = np.concatenate((new_minority, majority), axis=0)
            res_y = np.concatenate((np.full(len(new_minority), self.minority_name), np.full(len(majority), self.majority_name)), axis=0)

        else:
            res_X = X
            res_y = y

        self.minority_data_ = np.concatenate((minority, self.minority_data_), axis=0)

        return res_X, res_y