catwalk/individual_importance/uniform.py from dssg/catwalk

catwalk/individual_importance/uniform.py
Summary

Maintainability

1 hr
Test Coverage

Issues
def _entity_feature_values(matrix, feature_name, as_of_date=None):
    """Finds the value of the given feature for each entity in a matrix

    Args:
        matrix (pandas.DataFrame), with index either 'entity_id' or 'entity_id'/'as_of_date'
        feature_name (string) The name of a column in the matrix
        as_of_date (optional) If the matrix is indexed with as_of_date,
            then this must be one of the valid as_of_dates in the matrix

    Returns: (list) of (entity_id, feature_value) tuples
    """
    if matrix.index.names == ['entity_id']:
        return list(zip(matrix.index.values, matrix[feature_name].tolist()))
    elif 'entity_id' in matrix.index.names:
        results = []
        index_of_entity = matrix.index.names.index('entity_id')
        index_of_date = matrix.index.names.index('as_of_date')
        for row in zip(matrix.index.values, matrix[feature_name].tolist()):
            index_values, feature_value = row
            entity_id = index_values[index_of_entity]
            if index_values[index_of_date] == as_of_date:
                results.append((entity_id, feature_value))
        return results
    else:
        return []


def uniform_distribution(db_engine, model_id, as_of_date, test_matrix_store, n_ranks):
    """Calculates individual feature importances based on the global feature importances

    Args:
        db_engine (sqlalchemy.engine)
        model_id (int) A model id, expected to be present in results.models
        as_of_date (datetime or string) The date to produce individual importances as of
        test_matrix_store (catwalk.storage.MatrixStore) The test matrix
        n_ranks (int) Number of ranks to calculate and save. Defaults to 5

    Returns: (list) dicts with entity_id, feature_value, feature_name, score
    """
    global_feature_importances = [row for row in db_engine.execute(
        '''select feature, feature_importance
        from results.feature_importances where model_id = %s
        order by feature_importance desc limit %s''',
        model_id,
        n_ranks
    )]

    results = []

    for feature_name, feature_importance in global_feature_importances:
        efv = _entity_feature_values(test_matrix_store.matrix, feature_name, as_of_date)
        for entity_id, feature_value in efv:
            results.append({
                'entity_id': entity_id,
                'feature_value': feature_value,
                'feature_name': feature_name,
                'score': feature_importance,
            })
    return results