|
| 1 | +import numpy as np |
| 2 | +import matplotlib.pyplot as plt |
| 3 | +import pandas as pd |
| 4 | +import matplotlib.patches as mpatches |
| 5 | +import random |
| 6 | +import math |
| 7 | +import scipy.spatial.distance as distance |
| 8 | +from sklearn.covariance import MinCovDet as MCD |
| 9 | +import scipy.stats as stats |
| 10 | +from numpy import linalg as LA |
| 11 | +import progressbar |
| 12 | +from pathos.multiprocessing import ProcessingPool as Pool |
| 13 | +from sklearn.cluster import KMeans |
| 14 | + |
| 15 | + |
| 16 | +class OutlierMahalanobis(object): |
| 17 | + |
| 18 | + def __init__(self, support_fraction = 0.95, verbose = True, chi2_percentile = 0.995,qqplot=True): |
| 19 | + self.verbose = verbose |
| 20 | + self.support_fraction = support_fraction |
| 21 | + self.chi2 = stats.chi2 |
| 22 | + self.mcd = MCD(store_precision = True, support_fraction = support_fraction) |
| 23 | + self.chi2_percentile = chi2_percentile |
| 24 | + self.qqplot=qqplot |
| 25 | + |
| 26 | + def fit(self, X): |
| 27 | + """Prints some summary stats (if verbose is one) and returns the indices of what it consider to be extreme""" |
| 28 | + self.mcd.fit(X) |
| 29 | + d = np.array([distance.mahalanobis(p, self.mcd.location_, self.mcd.precision_ ) for p in X]) |
| 30 | + self.d2 = d**2 #MD squared |
| 31 | + n, self.degrees_of_freedom_ = X.shape |
| 32 | + self.iextreme_values = (self.d2 > self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_) ) |
| 33 | + if self.verbose: |
| 34 | + print("%.3f proportion of outliers at %.3f%% chi2 percentile, "%(self.iextreme_values.sum()/float(n), self.chi2_percentile)) |
| 35 | + print("with support fraction %.2f."%self.support_fraction) |
| 36 | + pvalue=stats.kstest(self.d2, lambda x : stats.chi2.cdf(x,df=self.degrees_of_freedom_))[1] |
| 37 | + if pvalue <= 0.01: |
| 38 | + print('Attention : Très forte présomption contre l\'hypothèse nulle p_value : '+str(pvalue)) |
| 39 | + elif pvalue <= 0.05: |
| 40 | + print('Attention : Forte présomption contre l\'hypothèse nulle p_value : '+str(pvalue)) |
| 41 | + elif pvalue <= 0.1: |
| 42 | + print('Faible présomption contre l\'hypothèse nulle p_value : '+str(pvalue)) |
| 43 | + else : |
| 44 | + print('Pas de présomption contre l\'hypothèse nulle. p_value : '+str(pvalue)) |
| 45 | + if self.qqplot==True : |
| 46 | + plt.figure(figsize=(10,10)) |
| 47 | + stats.probplot(self.d2,dist=stats.chi2(df=self.degrees_of_freedom_), plot=plt) |
| 48 | + plt.title('QQ plot between Mahanalobis distance quantiles and Chi2 quantiles') |
| 49 | + plt.show() |
| 50 | + |
| 51 | + return self |
| 52 | + |
| 53 | + def plot(self,log=False, sort = False ): |
| 54 | + """ |
| 55 | + Cause plotting is always fun. |
| 56 | +
|
| 57 | + log: transform the distance-sq to a log ( distance-sq ) |
| 58 | + sort: sort the data according to distnace before plotting |
| 59 | + ifollow: a set if indices to mark with yellow, useful for seeing where data lies across views. |
| 60 | +
|
| 61 | + """ |
| 62 | + n = self.d2.shape[0] |
| 63 | + fig = plt.figure(figsize=(10,10)) |
| 64 | + |
| 65 | + x = np.arange( n ) |
| 66 | + ax = fig.add_subplot(111) |
| 67 | + |
| 68 | + |
| 69 | + transform = (lambda x: x ) if not log else (lambda x: np.log(x)) |
| 70 | + chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_) |
| 71 | + |
| 72 | + chi_line = transform( chi_line ) |
| 73 | + d2 = transform( self.d2 ) |
| 74 | + if sort: |
| 75 | + isort = np.argsort( d2 ) |
| 76 | + ax.scatter(x, d2[isort], alpha = 0.7, facecolors='none' ) |
| 77 | + plt.plot( x, transform(self.chi2.ppf( np.linspace(0,1,n),self.degrees_of_freedom_ )), c="r", label="distribution assuming normal" ) |
| 78 | + |
| 79 | + |
| 80 | + else: |
| 81 | + ax.scatter(x, d2 ) |
| 82 | + extreme_values = d2[ self.iextreme_values ] |
| 83 | + ax.scatter( x[self.iextreme_values], extreme_values, color="r" ) |
| 84 | + |
| 85 | + ax.hlines( chi_line, 0, n, |
| 86 | + label ="%.1f%% $\chi^2$ quantile"%(100*self.chi2_percentile), linestyles = "dotted" ) |
| 87 | + |
| 88 | + ax.legend() |
| 89 | + ax.set_ylabel("distance squared") |
| 90 | + ax.set_xlabel("observation") |
| 91 | + ax.set_xlim(0, self.d2.shape[0]) |
| 92 | + |
| 93 | + |
| 94 | + plt.show() |
| 95 | + |
| 96 | +class R_pca: |
| 97 | + |
| 98 | + def __init__(self, D, mu=None, lmbda=None): |
| 99 | + self.D = D |
| 100 | + self.S = np.zeros(self.D.shape) |
| 101 | + self.Y = np.zeros(self.D.shape) |
| 102 | + |
| 103 | + if mu: |
| 104 | + self.mu = mu |
| 105 | + else: |
| 106 | + self.mu = np.prod(self.D.shape) / (4 * self.norm_p(self.D, 2)) |
| 107 | + |
| 108 | + self.mu_inv = 1 / self.mu |
| 109 | + |
| 110 | + if lmbda: |
| 111 | + self.lmbda = lmbda |
| 112 | + else: |
| 113 | + self.lmbda = 1 / np.sqrt(np.max(self.D.shape)) |
| 114 | + |
| 115 | + @staticmethod |
| 116 | + def norm_p(M, p): |
| 117 | + return np.sum(np.power(M, p)) |
| 118 | + |
| 119 | + @staticmethod |
| 120 | + def shrink(M, tau): |
| 121 | + return np.sign(M) * np.maximum((np.abs(M) - tau), np.zeros(M.shape)) |
| 122 | + |
| 123 | + def svd_threshold(self, M, tau): |
| 124 | + U, S, V = np.linalg.svd(M, full_matrices=False) |
| 125 | + return np.dot(U, np.dot(np.diag(self.shrink(S, tau)), V)) |
| 126 | + |
| 127 | + def fit(self, tol=None, max_iter=1000, iter_print=100): |
| 128 | + iter = 0 |
| 129 | + err = np.Inf |
| 130 | + Sk = self.S |
| 131 | + Yk = self.Y |
| 132 | + Lk = np.zeros(self.D.shape) |
| 133 | + |
| 134 | + if tol: |
| 135 | + _tol = tol |
| 136 | + else: |
| 137 | + _tol = 1E-7 * self.norm_p(np.abs(self.D), 2) |
| 138 | + |
| 139 | + while (err > _tol) and iter < max_iter: |
| 140 | + Lk = self.svd_threshold( |
| 141 | + self.D - Sk + self.mu_inv * Yk, self.mu_inv) |
| 142 | + Sk = self.shrink( |
| 143 | + self.D - Lk + (self.mu_inv * Yk), self.mu_inv * self.lmbda) |
| 144 | + Yk = Yk + self.mu * (self.D - Lk - Sk) |
| 145 | + err = self.norm_p(np.abs(self.D - Lk - Sk), 2) |
| 146 | + iter += 1 |
| 147 | + if (iter % iter_print) == 0 or iter == 1 or iter > max_iter or err <= _tol: |
| 148 | + print('iteration: {0}, error: {1}'.format(iter, err)) |
| 149 | + |
| 150 | + self.L = Lk |
| 151 | + self.S = Sk |
| 152 | + return Lk, Sk |
| 153 | + |
| 154 | + def p_outliers(self,p): |
| 155 | + norm=LA.norm(self.S,axis=0) |
| 156 | + return np.argsort(norm)[::-1][0:p] |
| 157 | + |
| 158 | + def plot_normC(self): |
| 159 | + norm=np.sort(LA.norm(self.S,axis=0))[::-1] |
| 160 | + plt.figure(figsize=(10,10)) |
| 161 | + plt.plot(norm,'r-') |
| 162 | + plt.legend(['Norme 2 de $C_{0}$'],fontsize=15) |
| 163 | + plt.title('Norme 2 de $C_{0}$ pour chaque point',fontsize=15) |
| 164 | + plt.show() |
| 165 | + |
| 166 | + def plot_fit(self, size=None, tol=0.1, axis_on=True): |
| 167 | + |
| 168 | + n, d = self.D.shape |
| 169 | + |
| 170 | + if size: |
| 171 | + nrows, ncols = size |
| 172 | + else: |
| 173 | + sq = np.ceil(np.sqrt(n)) |
| 174 | + nrows = int(sq) |
| 175 | + ncols = int(sq) |
| 176 | + |
| 177 | + ymin = np.nanmin(self.D) |
| 178 | + ymax = np.nanmax(self.D) |
| 179 | + print('ymin: {0}, ymax: {1}'.format(ymin, ymax)) |
| 180 | + |
| 181 | + numplots = np.min([n, nrows * ncols]) |
| 182 | + plt.figure(figsize=size) |
| 183 | + |
| 184 | + for n in range(numplots): |
| 185 | + plt.subplot(nrows, ncols, n + 1) |
| 186 | + plt.ylim((ymin - tol, ymax + tol)) |
| 187 | + plt.plot(self.L[n, :] + self.S[n, :], 'r') |
| 188 | + plt.plot(self.L[n, :], 'b') |
| 189 | + if not axis_on: |
| 190 | + plt.axis('off') |
| 191 | + |
| 192 | + |
| 193 | +class OutliersKmeans: |
| 194 | + ''' v.0.2 OutliersKmeans : Find outliers using Kmeans. Only for semi-supervised outlier detection''' |
| 195 | + def __init__(self,normaldata,kmeans,parallel=False): |
| 196 | + self.normaldata=normaldata |
| 197 | + self.kmeans=kmeans |
| 198 | + self.parallel=parallel |
| 199 | + |
| 200 | + def sleep(self,delay): |
| 201 | + if __name__ != '__main__': |
| 202 | + delay /= non_interactive_sleep_factor |
| 203 | + time.sleep(delay) |
| 204 | + |
| 205 | + |
| 206 | + def to_center_distances(self,X,mu): |
| 207 | + D=[] |
| 208 | + for i in range(len(X)): |
| 209 | + D.append(np.sum((X[i]-mu)**2)) |
| 210 | + return D |
| 211 | + |
| 212 | + def find_extreme_point(self,centers,ypred,X): |
| 213 | + K=len(centers) |
| 214 | + d={} |
| 215 | + for i in range(K): |
| 216 | + X_in_cluster=X[np.where(ypred==i),:][0] |
| 217 | + D=self.to_center_distances(X_in_cluster,centers[i]) |
| 218 | + indexmax=np.argmax(D) |
| 219 | + d[i+1]=(X[indexmax],math.sqrt(np.max(D))) |
| 220 | + return d |
| 221 | + |
| 222 | + def draw_circle(self,Npoints,radius,center): |
| 223 | + circle_slice=2*math.pi/Npoints |
| 224 | + p=[] |
| 225 | + for i in range(Npoints): |
| 226 | + angle=circle_slice*i |
| 227 | + newx=center[0]+radius*math.cos(angle) |
| 228 | + newy=center[1]+radius*math.sin(angle) |
| 229 | + p.append([newx,newy]) |
| 230 | + return np.array(p) |
| 231 | + |
| 232 | + def bounds(self,centers,ypred,Xreduced,Npoints=100): |
| 233 | + d=self.find_extreme_point(centers,ypred,Xreduced) |
| 234 | + K=len(d) |
| 235 | + circles=[] |
| 236 | + for i in range(K): |
| 237 | + center=centers[i] |
| 238 | + radius=d[i+1][1] |
| 239 | + circles.append(self.draw_circle(Npoints,radius,center)) |
| 240 | + return circles |
| 241 | + |
| 242 | + def in_circle(self,point,center,radius): |
| 243 | + return math.sqrt(np.sum((point-center)**2))<=radius |
| 244 | + |
| 245 | + def in_any_circle(self,point,centers,d): |
| 246 | + K=len(d) |
| 247 | + incirclek=[] |
| 248 | + for k in range(K): |
| 249 | + center=centers[k] |
| 250 | + radius=d[k+1][1] |
| 251 | + incirclek.append(self.in_circle(point,center,radius)) |
| 252 | + return np.any(incirclek) |
| 253 | + |
| 254 | + |
| 255 | + def is_outlier(self,points,showbar=True): |
| 256 | + self.kmeans.fit(self.normaldata) |
| 257 | + y=self.kmeans.predict(self.normaldata) |
| 258 | + centers=self.kmeans.cluster_centers_ |
| 259 | + d=self.find_extreme_point(centers,y,self.normaldata) |
| 260 | + result=[] |
| 261 | + if showbar == True : |
| 262 | + widgets = [progressbar.Percentage(), progressbar.Bar()] |
| 263 | + bar = progressbar.ProgressBar(widgets=widgets, max_value=len(points)).start() |
| 264 | + j=0 |
| 265 | + if self.parallel==True: |
| 266 | + p=Pool(4) |
| 267 | + g=lambda point:not(self.in_any_circle(point,centers,d)) |
| 268 | + result=p.map(g, outliers) |
| 269 | + |
| 270 | + else: |
| 271 | + for point in points: |
| 272 | + |
| 273 | + result.append(not(self.in_any_circle(point,centers,d))) |
| 274 | + |
| 275 | + if showbar== True : |
| 276 | + self.sleep(0.1) |
| 277 | + bar.update(j) |
| 278 | + j=j+1 |
| 279 | + |
| 280 | + if showbar== True : |
| 281 | + bar.finish() |
| 282 | + return result |
0 commit comments