okfn-brasil/serenata-de-amor

View on GitHub
rosie/rosie/chamber_of_deputies/classifiers/monthly_subquota_limit_classifier.py

Summary

Maintainability
A
0 mins
Test Coverage
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin


class MonthlySubquotaLimitClassifier(TransformerMixin):
    """
    Monthly Subquota Limit classifier.

    Dataset
    -------
    issue_date : datetime column
        Date when the expense was made.

    month : int column
        The quota month matching the expense request.

    net_value : float column
        The value of the expense.

    subquota_number : category column
        A number to classify a category of expenses.

    year : int column
        The quota year matching the expense request.
    """

    KEYS = ['applicant_id', 'month', 'year']
    COLS = ['applicant_id',
            'issue_date',
            'month',
            'net_value',
            'subquota_number',
            'year']

    def fit(self, X):
        self.X = X
        self._X = self.X[self.COLS].copy()
        self.__create_columns()
        return self

    def transform(self, X=None):
        self.limits = [
            {
                # Automotive vehicle renting or charter (From 12/2013 to 03/2015)
                'data': self._X.query('(subquota_number == "120") & '
                                      '(reimbursement_month >= datetime(2013, 12, 1)) & '
                                      '(reimbursement_month <= datetime(2015, 3, 1))'),
                'monthly_limit': 1000000,
            },
            {
                # Automotive vehicle renting or charter (From 04/2015 to 04/2017)
                'data': self._X.query('(subquota_number == "120") & '
                                      '(reimbursement_month >= datetime(2015, 4, 1)) & '
                                      '(reimbursement_month <= datetime(2017, 4, 1))'),
                'monthly_limit': 1090000,
            },
            {
                # Automotive vehicle renting or charter (From 05/2017)
                'data': self._X.query('(subquota_number == "120") & '
                                      '(reimbursement_month >= datetime(2017, 5, 1))'),
                'monthly_limit': 1271300,
            },
            {
                # Taxi, toll and parking (From 12/2013 to 03/2015)
                'data': self._X.query('(subquota_number == "122") & '
                                      '(reimbursement_month >= datetime(2013, 12, 1)) & '
                                      '(reimbursement_month <= datetime(2015, 3, 1))'),
                'monthly_limit': 250000,
            },
            {
                # Taxi, toll and parking (From 04/2015)
                'data': self._X.query('(subquota_number == "122") & '
                                      '(reimbursement_month >= datetime(2015, 4, 1))'),
                'monthly_limit': 270000,
            },
            {
                # Fuels and lubricants (From 07/2009 to 03/2015)
                'data': self._X.query('(subquota_number == "3") & '
                                      '(reimbursement_month >= datetime(2009, 7, 1)) & '
                                      '(reimbursement_month <= datetime(2015, 3, 1))'),
                'monthly_limit': 450000,
            },
            {
                # Fuels and lubricants (From 04/2015 to 08/2015)
                'data': self._X.query('(subquota_number == "3") & '
                                      '(reimbursement_month >= datetime(2015, 4, 1)) & '
                                      '(reimbursement_month <= datetime(2015, 8, 1))'),
                'monthly_limit': 490000,
            },
            {
                # Fuels and lubricants (From 9/2015)
                'data': self._X.query('(subquota_number == "3") & '
                                      '(reimbursement_month >= datetime(2015, 9, 1))'),
                'monthly_limit': 600000,
            },
            {
                # Security service provided by specialized company (From 07/2009 to 4/2014)
                'data': self._X.query('(subquota_number == "8") & '
                                      '(reimbursement_month >= datetime(2009, 7, 1)) & '
                                      '(reimbursement_month <= datetime(2014, 4, 1))'),
                'monthly_limit': 450000,
            },
            {
                # Security service provided by specialized company (From 05/2014 to 3/2015)
                'data': self._X.query('(subquota_number == "8") & '
                                      '(reimbursement_month >= datetime(2014, 5, 1)) & '
                                      '(reimbursement_month <= datetime(2015, 3, 1))'),
                'monthly_limit': 800000,
            },
            {
                # Security service provided by specialized company (From 04/2015)
                'data': self._X.query('(subquota_number == "8") & '
                                      '(reimbursement_month >= datetime(2015, 4, 1))'),
                'monthly_limit': 870000,
            },
            {
                # Participation in course, talk or similar event (From 10/2015)
                'data': self._X.query('(subquota_number == "137") & '
                                      '(reimbursement_month >= datetime(2015, 10, 1))'),
                'monthly_limit': 769716,
            },
        ]
        return self

    def predict(self, X=None):
        self._X['is_over_monthly_subquota_limit'] = False
        for metadata in self.limits:
            data, monthly_limit = metadata['data'], metadata['monthly_limit']
            if len(data):
                surplus_reimbursements = self.__find_surplus_reimbursements(data, monthly_limit)
                self._X.loc[surplus_reimbursements.index,
                            'is_over_monthly_subquota_limit'] = True
        results = self._X.loc[self.X.index, 'is_over_monthly_subquota_limit']
        return np.r_[results]

    def predict_proba(self, X=None):
        return 1.

    def __create_columns(self):
        self._X['net_value_int'] = (self._X['net_value'] * 100).apply(int)

        self._X['coerced_issue_date'] = \
            pd.to_datetime(self._X['issue_date'], errors='coerce')
        self._X.sort_values('coerced_issue_date', kind='mergesort', inplace=True)

        reimbursement_month = self._X[['year', 'month']].copy()
        reimbursement_month['day'] = 1
        self._X['reimbursement_month'] = pd.to_datetime(reimbursement_month)

    def __find_surplus_reimbursements(self, data, monthly_limit):
        grouped = data.groupby(self.KEYS).apply(self.__create_cumsum_cols)
        return grouped[grouped['cumsum_net_value'] > monthly_limit]

    def __create_cumsum_cols(self, subset):
        subset['cumsum_net_value'] = subset['net_value_int'].cumsum()
        return subset