radar_parlamentar/analises/pca.py
# !/usr/bin/env python
""" a small class for Principal Component Analysis
Usage:
p = PCA( A, fraction=0.90 )
In:
A:an array of e.g. 1000 observations x 20 variables, 1000 rows x 20 columns
fraction: use principal components that account for e.g.
90 % of the total variance
Out:
p.U, p.d, p.Vt: from numpy.linalg.svd, A = U . d . Vt
p.dinv: 1/d or 0, see NR
p.eigen: the eigenvalues of A*A, in decreasing order (p.d**2).
eigen[j] / eigen.sum() is variable j's fraction of the total variance;
look at the first few eigen[] to see how many PCs get to 90 %, 95 % ...
p.npc: number of principal components,
e.g. 2 if the top 2 eigenvalues are >= `fraction` of the total.
It's ok to change this; methods use the current value.
Methods:
The methods of class PCA transform vectors or arrays of e.g.
20 variables, 2 principal components and 1000 observations,
using partial matrices U' d' Vt', parts of the full U d Vt:
A ~ U' . d' . Vt' where e.g.
U' is 1000 x 2
d' is diag([ d0, d1 ]), the 2 largest singular values
Vt' is 2 x 20. Dropping the primes,
d . Vt 2 principal vars = p.vars_pc( 20 vars )
U 1000 obs = p.pc_obs( 2 principal vars )
U . d . Vt 1000 obs, p.obs( 20 vars ) = pc_obs( vars_pc( vars ))
fast approximate A . vars, using the `npc` principal components
Ut 2 pcs = p.obs_pc( 1000 obs )
V . dinv 20 vars = p.pc_vars( 2 principal vars )
V . dinv . Ut 20 vars, p.vars( 1000 obs ) = pc_vars( obs_pc( obs )),
fast approximate Ainverse . obs: vars that give ~ those obs.
Notes:
PCA does not center or scale A; you usually want to first
A -= A.mean(A, axis=0)
A /= A.std(A, axis=0)
with the little class Center or the like, below.
See also:
http://en.wikipedia.org/wiki/Principal_component_analysis
http://en.wikipedia.org/wiki/Singular_value_decomposition
Press et al., Numerical Recipes (2 or 3 ed), SVD
PCA micro-tutorial
iris-pca .py .png
Source: http://stackoverflow.com/questions/1730600/principal-component-analysis
-in-python
"""
import numpy as np
import logging
dot = np.dot
# import bz.numpyutil as nu
# dot = nu.pdot
__version__ = "2010-04-14 apr"
__author_email__ = "denis-bz-py at t-online dot de"
logger = logging.getLogger("radar")
'#.........................................................................'
class PCA:
def __init__(self, A, fraction=0.90):
assert 0 <= fraction <= 1
# A = U . diag(d) . Vt, O( m n^2 ), lapack_lite --
self.U, self.d, self.Vt = np.linalg.svd(A, full_matrices=False)
assert np.all(self.d[:-1] >= self.d[1:]) # sorted
self.eigen = self.d ** 2
self.sumvariance = np.cumsum(self.eigen)
self.sumvariance /= self.sumvariance[-1]
self.npc = np.searchsorted(self.sumvariance, fraction) + 1
self.dinv = np.array([1 / d if d > self.d[0] * 1e-6 else 0
for d in self.d])
def pc(self):
""" e.g. 1000 x 2 U[:, :npc] * d[:npc], to plot etc. """
n = self.npc
return self.U[:, :n] * self.d[:n]
# These 1-line methods may not be worth the bother;
# then use U d Vt directly --
def vars_pc(self, x):
n = self.npc
return self.d[:n] * dot(self.Vt[:n], x.T).T # 20 vars -> 2 principal
def pc_vars(self, p):
n = self.npc
return dot(self.Vt[:n].T, (self.dinv[:n] * p).T) .T # 2 PC -> 20 vars
def pc_obs(self, p):
n = self.npc
return dot(self.U[:, :n], p.T) # 2 principal -> 1000 obs
def obs_pc(self, obs):
n = self.npc
return dot(self.U[:, :n].T, obs) .T # 1000 obs -> 2 principal
def obs(self, x):
# 20 vars -> 2 principal -> 1000 obs
return self.pc_obs(self.vars_pc(x))
def vars(self, obs):
# 1000 obs -> 2 principal -> 20 vars
return self.pc_vars(self.obs_pc(obs))
class Center:
""" A -= A.mean() /= A.std(), inplace -- use A.copy() if need be
uncenter(x) == original A . x
"""
# mttiw
def __init__(self, A, axis=0, scale=True, verbose=1):
self.mean = A.mean(axis=axis)
if verbose:
print("Center -= A.mean:", self.mean)
A -= self.mean
if scale:
std = A.std(axis=axis)
self.std = np.where(std, std, 1.)
if verbose:
print("Center /= A.std:", self.std)
A /= self.std
else:
self.std = np.ones(A.shape[-1])
self.A = A
def uncenter(self, x):
return np.dot(self.A, x * self.std) + np.dot(x, self.mean)
'#.........................................................................'""
if __name__ == "__main__":
import sys
csv = "iris4.csv" # wikipedia Iris_flower_data_set
# 5.1,3.5,1.4,0.2 # ,Iris-setosa ...
N = 1000
K = 20
fraction = .90
seed = 1
exec("\n".join(sys.argv[1:])) # N= ...
np.random.seed(seed)
np.set_printoptions(1, threshold=100, suppress=True) # .1f
try:
A = np.genfromtxt(csv, delimiter=",")
N, K = A.shape
except IOError as error:
A = np.random.normal(size=(N, K)) # gen correlated ?
logger.error("IOError: %s" % error)
print("csv: %s N: %d K: %d fraction: %.2g" % (csv, N, K, fraction))
Center(A)
print("A:", A)
print("PCA ... \n")
p = PCA(A, fraction=fraction)
print("npc:", p.npc)
print("% variance:", p.sumvariance * 100)
print("Vt[0], weights that give PC 0:", p.Vt[0])
print("A . Vt[0]:", dot(A, p.Vt[0]))
print("pc:", p.pc())
print("\nobs <-> pc <-> x: with fraction=1, diffs should be ~ 0")
x = np.ones(K)
# x = np.ones(( 3, K ))
print("x:", x)
pc = p.vars_pc(x) # d' Vt' x
print("vars_pc(x):", pc)
print("back to ~ x:", p.pc_vars(pc))
Ax = dot(A, x.T)
pcx = p.obs(x) # U' d' Vt' x
print("Ax:", Ax)
print("A'x:", pcx)
print("max |Ax - A'x|: %.2g" % np.linalg.norm(Ax - pcx, np.inf))
b = Ax # ~ back to original x, Ainv A x
back = p.vars(b)
print("~ back again:", back)
print("max |back - x|: %.2g" % np.linalg.norm(back - x, np.inf))