Skip to content

Commit fcea2f7

Browse files
committed
outlierlib
1 parent 35b64f7 commit fcea2f7

File tree

7 files changed

+687
-6
lines changed

7 files changed

+687
-6
lines changed

PCA/robust_pca.ipynb

+26-6
Large diffs are not rendered by default.

img/norm.png

18 KB
Loading

img/proj.png

208 KB
Loading
11.2 KB
Binary file not shown.

lib/outlier.py

+282
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
import numpy as np
2+
import matplotlib.pyplot as plt
3+
import pandas as pd
4+
import matplotlib.patches as mpatches
5+
import random
6+
import math
7+
import scipy.spatial.distance as distance
8+
from sklearn.covariance import MinCovDet as MCD
9+
import scipy.stats as stats
10+
from numpy import linalg as LA
11+
import progressbar
12+
from pathos.multiprocessing import ProcessingPool as Pool
13+
from sklearn.cluster import KMeans
14+
15+
16+
class OutlierMahalanobis(object):
17+
18+
def __init__(self, support_fraction = 0.95, verbose = True, chi2_percentile = 0.995,qqplot=True):
19+
self.verbose = verbose
20+
self.support_fraction = support_fraction
21+
self.chi2 = stats.chi2
22+
self.mcd = MCD(store_precision = True, support_fraction = support_fraction)
23+
self.chi2_percentile = chi2_percentile
24+
self.qqplot=qqplot
25+
26+
def fit(self, X):
27+
"""Prints some summary stats (if verbose is one) and returns the indices of what it consider to be extreme"""
28+
self.mcd.fit(X)
29+
d = np.array([distance.mahalanobis(p, self.mcd.location_, self.mcd.precision_ ) for p in X])
30+
self.d2 = d**2 #MD squared
31+
n, self.degrees_of_freedom_ = X.shape
32+
self.iextreme_values = (self.d2 > self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_) )
33+
if self.verbose:
34+
print("%.3f proportion of outliers at %.3f%% chi2 percentile, "%(self.iextreme_values.sum()/float(n), self.chi2_percentile))
35+
print("with support fraction %.2f."%self.support_fraction)
36+
pvalue=stats.kstest(self.d2, lambda x : stats.chi2.cdf(x,df=self.degrees_of_freedom_))[1]
37+
if pvalue <= 0.01:
38+
print('Attention : Très forte présomption contre l\'hypothèse nulle p_value : '+str(pvalue))
39+
elif pvalue <= 0.05:
40+
print('Attention : Forte présomption contre l\'hypothèse nulle p_value : '+str(pvalue))
41+
elif pvalue <= 0.1:
42+
print('Faible présomption contre l\'hypothèse nulle p_value : '+str(pvalue))
43+
else :
44+
print('Pas de présomption contre l\'hypothèse nulle. p_value : '+str(pvalue))
45+
if self.qqplot==True :
46+
plt.figure(figsize=(10,10))
47+
stats.probplot(self.d2,dist=stats.chi2(df=self.degrees_of_freedom_), plot=plt)
48+
plt.title('QQ plot between Mahanalobis distance quantiles and Chi2 quantiles')
49+
plt.show()
50+
51+
return self
52+
53+
def plot(self,log=False, sort = False ):
54+
"""
55+
Cause plotting is always fun.
56+
57+
log: transform the distance-sq to a log ( distance-sq )
58+
sort: sort the data according to distnace before plotting
59+
ifollow: a set if indices to mark with yellow, useful for seeing where data lies across views.
60+
61+
"""
62+
n = self.d2.shape[0]
63+
fig = plt.figure(figsize=(10,10))
64+
65+
x = np.arange( n )
66+
ax = fig.add_subplot(111)
67+
68+
69+
transform = (lambda x: x ) if not log else (lambda x: np.log(x))
70+
chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_)
71+
72+
chi_line = transform( chi_line )
73+
d2 = transform( self.d2 )
74+
if sort:
75+
isort = np.argsort( d2 )
76+
ax.scatter(x, d2[isort], alpha = 0.7, facecolors='none' )
77+
plt.plot( x, transform(self.chi2.ppf( np.linspace(0,1,n),self.degrees_of_freedom_ )), c="r", label="distribution assuming normal" )
78+
79+
80+
else:
81+
ax.scatter(x, d2 )
82+
extreme_values = d2[ self.iextreme_values ]
83+
ax.scatter( x[self.iextreme_values], extreme_values, color="r" )
84+
85+
ax.hlines( chi_line, 0, n,
86+
label ="%.1f%% $\chi^2$ quantile"%(100*self.chi2_percentile), linestyles = "dotted" )
87+
88+
ax.legend()
89+
ax.set_ylabel("distance squared")
90+
ax.set_xlabel("observation")
91+
ax.set_xlim(0, self.d2.shape[0])
92+
93+
94+
plt.show()
95+
96+
class R_pca:
97+
98+
def __init__(self, D, mu=None, lmbda=None):
99+
self.D = D
100+
self.S = np.zeros(self.D.shape)
101+
self.Y = np.zeros(self.D.shape)
102+
103+
if mu:
104+
self.mu = mu
105+
else:
106+
self.mu = np.prod(self.D.shape) / (4 * self.norm_p(self.D, 2))
107+
108+
self.mu_inv = 1 / self.mu
109+
110+
if lmbda:
111+
self.lmbda = lmbda
112+
else:
113+
self.lmbda = 1 / np.sqrt(np.max(self.D.shape))
114+
115+
@staticmethod
116+
def norm_p(M, p):
117+
return np.sum(np.power(M, p))
118+
119+
@staticmethod
120+
def shrink(M, tau):
121+
return np.sign(M) * np.maximum((np.abs(M) - tau), np.zeros(M.shape))
122+
123+
def svd_threshold(self, M, tau):
124+
U, S, V = np.linalg.svd(M, full_matrices=False)
125+
return np.dot(U, np.dot(np.diag(self.shrink(S, tau)), V))
126+
127+
def fit(self, tol=None, max_iter=1000, iter_print=100):
128+
iter = 0
129+
err = np.Inf
130+
Sk = self.S
131+
Yk = self.Y
132+
Lk = np.zeros(self.D.shape)
133+
134+
if tol:
135+
_tol = tol
136+
else:
137+
_tol = 1E-7 * self.norm_p(np.abs(self.D), 2)
138+
139+
while (err > _tol) and iter < max_iter:
140+
Lk = self.svd_threshold(
141+
self.D - Sk + self.mu_inv * Yk, self.mu_inv)
142+
Sk = self.shrink(
143+
self.D - Lk + (self.mu_inv * Yk), self.mu_inv * self.lmbda)
144+
Yk = Yk + self.mu * (self.D - Lk - Sk)
145+
err = self.norm_p(np.abs(self.D - Lk - Sk), 2)
146+
iter += 1
147+
if (iter % iter_print) == 0 or iter == 1 or iter > max_iter or err <= _tol:
148+
print('iteration: {0}, error: {1}'.format(iter, err))
149+
150+
self.L = Lk
151+
self.S = Sk
152+
return Lk, Sk
153+
154+
def p_outliers(self,p):
155+
norm=LA.norm(self.S,axis=0)
156+
return np.argsort(norm)[::-1][0:p]
157+
158+
def plot_normC(self):
159+
norm=np.sort(LA.norm(self.S,axis=0))[::-1]
160+
plt.figure(figsize=(10,10))
161+
plt.plot(norm,'r-')
162+
plt.legend(['Norme 2 de $C_{0}$'],fontsize=15)
163+
plt.title('Norme 2 de $C_{0}$ pour chaque point',fontsize=15)
164+
plt.show()
165+
166+
def plot_fit(self, size=None, tol=0.1, axis_on=True):
167+
168+
n, d = self.D.shape
169+
170+
if size:
171+
nrows, ncols = size
172+
else:
173+
sq = np.ceil(np.sqrt(n))
174+
nrows = int(sq)
175+
ncols = int(sq)
176+
177+
ymin = np.nanmin(self.D)
178+
ymax = np.nanmax(self.D)
179+
print('ymin: {0}, ymax: {1}'.format(ymin, ymax))
180+
181+
numplots = np.min([n, nrows * ncols])
182+
plt.figure(figsize=size)
183+
184+
for n in range(numplots):
185+
plt.subplot(nrows, ncols, n + 1)
186+
plt.ylim((ymin - tol, ymax + tol))
187+
plt.plot(self.L[n, :] + self.S[n, :], 'r')
188+
plt.plot(self.L[n, :], 'b')
189+
if not axis_on:
190+
plt.axis('off')
191+
192+
193+
class OutliersKmeans:
194+
''' v.0.2 OutliersKmeans : Find outliers using Kmeans. Only for semi-supervised outlier detection'''
195+
def __init__(self,normaldata,kmeans,parallel=False):
196+
self.normaldata=normaldata
197+
self.kmeans=kmeans
198+
self.parallel=parallel
199+
200+
def sleep(self,delay):
201+
if __name__ != '__main__':
202+
delay /= non_interactive_sleep_factor
203+
time.sleep(delay)
204+
205+
206+
def to_center_distances(self,X,mu):
207+
D=[]
208+
for i in range(len(X)):
209+
D.append(np.sum((X[i]-mu)**2))
210+
return D
211+
212+
def find_extreme_point(self,centers,ypred,X):
213+
K=len(centers)
214+
d={}
215+
for i in range(K):
216+
X_in_cluster=X[np.where(ypred==i),:][0]
217+
D=self.to_center_distances(X_in_cluster,centers[i])
218+
indexmax=np.argmax(D)
219+
d[i+1]=(X[indexmax],math.sqrt(np.max(D)))
220+
return d
221+
222+
def draw_circle(self,Npoints,radius,center):
223+
circle_slice=2*math.pi/Npoints
224+
p=[]
225+
for i in range(Npoints):
226+
angle=circle_slice*i
227+
newx=center[0]+radius*math.cos(angle)
228+
newy=center[1]+radius*math.sin(angle)
229+
p.append([newx,newy])
230+
return np.array(p)
231+
232+
def bounds(self,centers,ypred,Xreduced,Npoints=100):
233+
d=self.find_extreme_point(centers,ypred,Xreduced)
234+
K=len(d)
235+
circles=[]
236+
for i in range(K):
237+
center=centers[i]
238+
radius=d[i+1][1]
239+
circles.append(self.draw_circle(Npoints,radius,center))
240+
return circles
241+
242+
def in_circle(self,point,center,radius):
243+
return math.sqrt(np.sum((point-center)**2))<=radius
244+
245+
def in_any_circle(self,point,centers,d):
246+
K=len(d)
247+
incirclek=[]
248+
for k in range(K):
249+
center=centers[k]
250+
radius=d[k+1][1]
251+
incirclek.append(self.in_circle(point,center,radius))
252+
return np.any(incirclek)
253+
254+
255+
def is_outlier(self,points,showbar=True):
256+
self.kmeans.fit(self.normaldata)
257+
y=self.kmeans.predict(self.normaldata)
258+
centers=self.kmeans.cluster_centers_
259+
d=self.find_extreme_point(centers,y,self.normaldata)
260+
result=[]
261+
if showbar == True :
262+
widgets = [progressbar.Percentage(), progressbar.Bar()]
263+
bar = progressbar.ProgressBar(widgets=widgets, max_value=len(points)).start()
264+
j=0
265+
if self.parallel==True:
266+
p=Pool(4)
267+
g=lambda point:not(self.in_any_circle(point,centers,d))
268+
result=p.map(g, outliers)
269+
270+
else:
271+
for point in points:
272+
273+
result.append(not(self.in_any_circle(point,centers,d)))
274+
275+
if showbar== True :
276+
self.sleep(0.1)
277+
bar.update(j)
278+
j=j+1
279+
280+
if showbar== True :
281+
bar.finish()
282+
return result
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"cells": [],
3+
"metadata": {},
4+
"nbformat": 4,
5+
"nbformat_minor": 2
6+
}

0 commit comments

Comments
 (0)