yzhao062/Pyod

View on GitHub
notebooks/Model Combination.ipynb

Summary

Maintainability
Test Coverage
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example of combining multiple base outlier scores. \n",
    "\n",
    "**[PyOD](https://github.com/yzhao062/Pyod)** is a comprehensive **Python toolkit** to **identify outlying objects** in \n",
    "multivariate data with both unsupervised and supervised approaches.\n",
    "\n",
    "Four combination frameworks are demonstrated in this example:\n",
    "\n",
    "1. Average: take the average of all base detectors\n",
    "2. maximization : take the maximum score across all detectors as the score\n",
    "3. Average of Maximum (AOM)\n",
    "4. Maximum of Average (MOA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "from __future__ import print_function\n",
    "\n",
    "import os\n",
    "import sys\n",
    "\n",
    "# temporary solution for relative imports in case pyod is not installed\n",
    "# if pyod is installed, no need to use the following line\n",
    "sys.path.append(os.path.abspath(os.path.join(os.path.dirname(\"__file__\"), '..')))\n",
    "\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from scipy.io import loadmat\n",
    "\n",
    "from pyod.models.knn import KNN\n",
    "from pyod.models.combination import aom, moa, average, maximization\n",
    "from pyod.utils.utility import standardizer\n",
    "from pyod.utils.data import generate_data\n",
    "from pyod.utils.data import evaluate_print"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Define data file and read X and y\n",
    "# Generate some data if the source data is missing\n",
    "mat_file = 'cardio.mat'\n",
    "\n",
    "try:\n",
    "    mat = loadmat(os.path.join('data', mat_file))\n",
    "\n",
    "except TypeError:\n",
    "    print('{data_file} does not exist. Use generated data'.format(\n",
    "        data_file=mat_file))\n",
    "    X, y = generate_data(train_only=True)  # load data\n",
    "except IOError:\n",
    "    print('{data_file} does not exist. Use generated data'.format(\n",
    "        data_file=mat_file))\n",
    "    X, y = generate_data(train_only=True)  # load data\n",
    "else:\n",
    "    X = mat['X']\n",
    "    y = mat['y'].ravel()\n",
    "    \n",
    "# 60% data for training and 40% for testing\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)\n",
    "\n",
    "# standardizing data for processing\n",
    "X_train_norm, X_test_norm = standardizer(X_train, X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training data: (1098, 21) (1098,)\n",
      "Test data: (733, 21) (733,)\n"
     ]
    }
   ],
   "source": [
    "print(\"Training data:\", X_train.shape, y_train.shape)\n",
    "print(\"Test data:\", X_test.shape, y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initializing 20 kNN detectors\n",
      "Base detector 0 is fitted for prediction\n",
      "Base detector 1 is fitted for prediction\n",
      "Base detector 2 is fitted for prediction\n",
      "Base detector 3 is fitted for prediction\n",
      "Base detector 4 is fitted for prediction\n",
      "Base detector 5 is fitted for prediction\n",
      "Base detector 6 is fitted for prediction\n",
      "Base detector 7 is fitted for prediction\n",
      "Base detector 8 is fitted for prediction\n",
      "Base detector 9 is fitted for prediction\n",
      "Base detector 10 is fitted for prediction\n",
      "Base detector 11 is fitted for prediction\n",
      "Base detector 12 is fitted for prediction\n",
      "Base detector 13 is fitted for prediction\n",
      "Base detector 14 is fitted for prediction\n",
      "Base detector 15 is fitted for prediction\n",
      "Base detector 16 is fitted for prediction\n",
      "Base detector 17 is fitted for prediction\n",
      "Base detector 18 is fitted for prediction\n",
      "Base detector 19 is fitted for prediction\n"
     ]
    }
   ],
   "source": [
    "n_clf = 20  # number of base detectors\n",
    "\n",
    "# Initialize 20 base detectors for combination\n",
    "k_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140,\n",
    "          150, 160, 170, 180, 190, 200]\n",
    "\n",
    "train_scores = np.zeros([X_train.shape[0], n_clf])\n",
    "test_scores = np.zeros([X_test.shape[0], n_clf])\n",
    "\n",
    "print('Initializing {n_clf} kNN detectors'.format(n_clf=n_clf))\n",
    "\n",
    "for i in range(n_clf):\n",
    "    k = k_list[i]\n",
    "\n",
    "    clf = KNN(n_neighbors=k, method='largest')\n",
    "    clf.fit(X_train_norm)\n",
    "\n",
    "    train_scores[:, i] = clf.decision_scores_\n",
    "    test_scores[:, i] = clf.decision_function(X_test_norm)\n",
    "    print('Base detector %i is fitted for prediction' % i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Decision score matrix on training data (1098, 20)\n",
      "Decision score matrix on test data (733, 20)\n"
     ]
    }
   ],
   "source": [
    "# Decision scores have to be normalized before combination\n",
    "train_scores_norm, test_scores_norm = standardizer(train_scores,\n",
    "                                                   test_scores)\n",
    "\n",
    "# Predicted scores from all base detectors on the test data is \n",
    "# stored in train_scores_norm and test_scores_norm\n",
    "print('Decision score matrix on training data', train_scores_norm.shape)\n",
    "print('Decision score matrix on test data', test_scores_norm.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Combination by Average ROC:0.9017, precision @ rank n:0.5342\n",
      "Combination by Maximization ROC:0.9024, precision @ rank n:0.5616\n",
      "Combination by AOM ROC:0.9073, precision @ rank n:0.5342\n",
      "Combination by MOA ROC:0.91, precision @ rank n:0.5342\n"
     ]
    }
   ],
   "source": [
    "# Combine and evaluate the combination result\n",
    "\n",
    "# Combination by average\n",
    "y_by_average = average(test_scores_norm)\n",
    "evaluate_print('Combination by Average', y_test, y_by_average)\n",
    "\n",
    "# Combination by max\n",
    "y_by_maximization = maximization(test_scores_norm)\n",
    "evaluate_print('Combination by Maximization', y_test, y_by_maximization)\n",
    "\n",
    "# Combination by aom\n",
    "y_by_aom = aom(test_scores_norm, n_buckets=5)\n",
    "evaluate_print('Combination by AOM', y_test, y_by_aom)\n",
    "\n",
    "# Combination by moa\n",
    "y_by_moa = moa(test_scores_norm, n_buckets=5)\n",
    "evaluate_print('Combination by MOA', y_test, y_by_moa)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}