
Commit a949672

More studies

1 parent: 56f1bdc
12 files changed: +3787, -89 lines

.gitignore: +4

@@ -6,3 +6,7 @@ CroppedYaleNoisy/
 autoencoder/autoencoder_h2o\.R
 
 *.h5
+
+*.gz
+
+*.pkl
Distance based/.ipynb_checkpoints/Mahalanobis-checkpoint.ipynb: +329 (large diff not rendered by default)

Distance based/.ipynb_checkpoints/Ouliter_using Kmeans_2-checkpoint.ipynb: +1,137 (large diff not rendered by default)

Distance based/Mahalanobis.ipynb: +304 (large diff not rendered by default)

File renamed without changes.

Distance based/Ouliter_using Kmeans_2.ipynb: +1,145 (large diff not rendered by default)

RNN/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -6 (this file was deleted)

RNN/Autoencoder.ipynb: +230, -49 (large diff not rendered by default)

RNN/Untitled.ipynb: -34 (this file was deleted)
@@ -0,0 +1,290 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SVC clustering"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Based on https://github.com/josiahw/SimpleSVClustering/blob/master/SimpleSVC.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy\n",
    "import numpy.linalg\n",
    "import sklearn.datasets\n",
    "from matplotlib import pyplot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def polyKernel(a, b, pwr):\n",
    "    #polynomial kernel: k(a, b) = (a . b)**pwr\n",
    "    return numpy.dot(a, b)**pwr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def rbfKernel(a, b, gamma):\n",
    "    #exponential (RBF-style) kernel: k(a, b) = exp(-gamma * ||a - b||)\n",
    "    #note this uses the norm itself, not the squared norm of a Gaussian RBF\n",
    "    return numpy.exp(-gamma * numpy.linalg.norm(a - b))"
   ]
  },
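  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of the two kernels on a pair of toy unit vectors. This cell is an added illustration, not part of the original study; the vectors `u` and `v` are arbitrary. `polyKernel` should return the dot product raised to `pwr`, and `rbfKernel` should decay with distance and equal 1 at zero distance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sanity check (illustrative only): u and v are arbitrary toy vectors.\n",
    "u = numpy.array([1.0, 0.0])\n",
    "v = numpy.array([0.0, 1.0])\n",
    "print(\"polyKernel(u, v, 2):\", polyKernel(u, v, 2))    # (u . v)**2 = 0\n",
    "print(\"polyKernel(u, u, 2):\", polyKernel(u, u, 2))    # (u . u)**2 = 1\n",
    "print(\"rbfKernel(u, v, 3.5):\", rbfKernel(u, v, 3.5))  # exp(-3.5*sqrt(2)), roughly 0.007\n",
    "print(\"rbfKernel(u, u, 3.5):\", rbfKernel(u, u, 3.5))  # exp(0) = 1"
   ]
  },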
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "class SimpleSVClustering:\n",
    "    w = None\n",
    "    a = None\n",
    "    b = None\n",
    "    bOffset = None\n",
    "    C = None\n",
    "    sv = None\n",
    "    kernel = None\n",
    "    kargs = ()\n",
    "    tolerance = None\n",
    "    verbose = False\n",
    "\n",
    "    def __init__(self,\n",
    "                 C,\n",
    "                 tolerance = 0.001,\n",
    "                 kernel = numpy.dot,\n",
    "                 kargs = ()\n",
    "                 ):\n",
    "        \"\"\"\n",
    "        The parameters are:\n",
    "         - C: SVC cost\n",
    "         - tolerance: gradient descent solution accuracy\n",
    "         - kernel: the kernel function to use as k(a, b, *kargs)\n",
    "         - kargs: extra parameters for the kernel\n",
    "        \"\"\"\n",
    "        self.C = C\n",
    "        self.kernel = kernel\n",
    "        self.tolerance = tolerance\n",
    "        self.kargs = kargs\n",
    "\n",
    "    def _checkClass(self, a, b, n_checks = 5):\n",
    "        \"\"\"\n",
    "        Straight-line interpolation between a and b, using n_checks segments.\n",
    "        Returns True if a and b are connected by a high-probability region, False otherwise.\n",
    "        NOTE: the authors originally suggested 20 segments, but that is slow; 5 is good enough in practice.\n",
    "        \"\"\"\n",
    "        for i in numpy.arange(1.0/n_checks, 1.0, 1.0/n_checks):\n",
    "            if self._predict(i*a + (1-i)*b) > self.b:\n",
    "                return False\n",
    "        return True\n",
    "\n",
    "    def _getAllClasses(self, X):\n",
    "        \"\"\"\n",
    "        Assign class labels to each vector based on connected graph components.\n",
    "        TODO: the outputs of this should really be saved in order to embed new points into the clusters.\n",
    "        \"\"\"\n",
    "        #1: build the connected clusters\n",
    "        unvisited = list(range(len(X)))\n",
    "        clusters = []\n",
    "        while len(unvisited):\n",
    "            #create a new cluster with the first unvisited node\n",
    "            c = [unvisited[0]]\n",
    "            unvisited.pop(0)\n",
    "            i = 0\n",
    "            while i < len(c) and len(unvisited):\n",
    "                #for every node in the cluster, add all connected unvisited nodes and remove them from the unvisited list\n",
    "                unvisitedNew = []\n",
    "                for j in unvisited:\n",
    "                    (c if self._checkClass(X[c[i],:], X[j,:]) else unvisitedNew).append(j)\n",
    "                unvisited = unvisitedNew\n",
    "                i += 1\n",
    "            clusters.append(c)\n",
    "\n",
    "        #2: turn the connected components into per-point class labels\n",
    "        classifications = numpy.zeros(len(X)) - 1\n",
    "        for i in range(len(clusters)):\n",
    "            for c in clusters[i]:\n",
    "                classifications[c] = i\n",
    "        return classifications\n",
    "\n",
    "    def fit(self, X):\n",
    "        \"\"\"\n",
    "        Fit to data X (unsupervised: no labels are used).\n",
    "        \"\"\"\n",
    "        #construct the kernel (Q) matrix for solving\n",
    "        Q = numpy.zeros((len(X), len(X)))\n",
    "        for i in range(len(X)):\n",
    "            for j in range(i, len(X)):\n",
    "                Qval = self.kernel(*(\n",
    "                    (X[i,:], X[j,:])\n",
    "                    + self.kargs\n",
    "                ))\n",
    "                Q[i,j] = Q[j,i] = Qval\n",
    "\n",
    "        #solve for a and w simultaneously by coordinate descent,\n",
    "        #so no quadratic solver is needed; the support vectors\n",
    "        #correspond to the non-zero values in a\n",
    "        self.w = numpy.zeros(X.shape[1])\n",
    "        self.a = numpy.zeros(X.shape[0])\n",
    "        delta = 10000000000.0\n",
    "        while delta > self.tolerance:\n",
    "            delta = 0.\n",
    "            for i in range(len(X)):\n",
    "                g = numpy.dot(Q[i,:], self.a) - Q[i,i]\n",
    "                adelta = self.a[i] - min(max(self.a[i] - g/Q[i,i], 0.0), self.C)\n",
    "                self.w += adelta * X[i,:]\n",
    "                delta += abs(adelta)\n",
    "                self.a[i] -= adelta\n",
    "            if self.verbose:\n",
    "                print(\"Descent step magnitude:\", delta)\n",
    "\n",
    "        #keep only the data for the support vectors\n",
    "        Qshrunk = Q[self.a >= self.C/100.,:][:,self.a >= self.C/100.]\n",
    "        self.sv = X[self.a >= self.C/100., :]\n",
    "        self.a = (self.a)[self.a >= self.C/100.]\n",
    "\n",
    "        #do an all-pairs contour check: calculate the pairwise contribution of all support vectors\n",
    "        for i in range(len(self.a)):\n",
    "            for j in range(len(self.a)):\n",
    "                Qshrunk[i,j] *= self.a[i]*self.a[j]\n",
    "\n",
    "        #this offset is needed for the radius calculation\n",
    "        self.bOffset = numpy.sum(numpy.sum(Qshrunk))\n",
    "        if self.verbose:\n",
    "            print(\"Number of support vectors:\", len(self.a))\n",
    "\n",
    "        #solve for b (the mean radius at the support vectors) to get the final classifier\n",
    "        self.b = numpy.mean(self._predict(self.sv))\n",
    "        if self.verbose:\n",
    "            print(\"Bias value:\", self.b)\n",
    "\n",
    "    def _predict(self, X):\n",
    "        \"\"\"\n",
    "        For SVClustering, we calculate a radius rather than a bias.\n",
    "        \"\"\"\n",
    "        if len(X.shape) < 2:\n",
    "            X = X.reshape((1,-1))\n",
    "        clss = numpy.zeros(len(X))\n",
    "        for i in range(len(X)):\n",
    "            clss[i] += self.kernel(*((X[i,:], X[i,:]) + self.kargs))\n",
    "            for j in range(len(self.sv)):\n",
    "                clss[i] -= 2 * self.a[j] * self.kernel(*((self.sv[j,:], X[i,:]) + self.kargs))\n",
    "        return (clss + self.bOffset)**0.5\n",
    "\n",
    "    def predict(self, X):\n",
    "        \"\"\"\n",
    "        Predict classes for data X.\n",
    "        NOTE: this should really be done with either the fitting data or a superset of the fitting data.\n",
    "        \"\"\"\n",
    "        return self._getAllClasses(X)"
   ]
  },
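  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Added note (a sketch of the math as implemented above, not text from the original notebook): `_predict` returns the kernel-space distance of a point $x$ from the centre of the learned sphere,\n",
    "\n",
    "$$R(x)^2 = k(x, x) - 2 \\sum_j a_j \\, k(sv_j, x) + \\sum_{i,j} a_i a_j \\, k(sv_i, sv_j),$$\n",
    "\n",
    "where the last term is the precomputed `bOffset`. The threshold `self.b` is the mean radius over the support vectors, and `_checkClass` treats two points as connected when every interpolated point on the segment between them satisfies $R(x) \\le b$."
   ]
  },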
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data, labels = sklearn.datasets.make_moons(400, noise=0.01, random_state=0)\n",
    "data -= numpy.mean(data, axis=0)\n",
    "\n",
    "#parameters can be sensitive; these ones work for the two moons\n",
    "C = 0.1\n",
    "clss = SimpleSVClustering(C, 1e-10, rbfKernel, (3.5,))\n",
    "clss.fit(data)\n",
    "\n",
    "#report the assigned classes for the two moons as a classification error\n",
    "t = clss.predict(data)\n",
    "print(\"Error\", numpy.sum((labels-t)**2) / float(len(data)))\n"
   ]
  },
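  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Added note (illustrative, not part of the original notebook): the cluster indices returned by `predict` are arbitrary, so the squared error above is only meaningful when cluster 0 happens to line up with label 0. For the two-cluster case, a label-permutation-invariant error can be computed as below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added illustration: take the better of the two possible label alignments.\n",
    "err = min(numpy.mean((labels - t)**2), numpy.mean((labels - (1 - t))**2))\n",
    "print(\"Permutation-invariant error:\", err)"
   ]
  },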
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#generate a heatmap of the radius function and display the classified clusters\n",
    "a = numpy.zeros((100,100))\n",
    "for i in range(100):\n",
    "    for j in range(100):\n",
    "        a[j,i] = clss._predict(numpy.array([i*4/100.-2, j*4/100.-2]))\n",
    "pyplot.imshow(a, cmap='hot', interpolation='nearest')\n",
    "#rescale the data into the heatmap's 100x100 pixel coordinates\n",
    "data *= 25.\n",
    "data += 50.\n",
    "pyplot.scatter(data[t==0,0], data[t==0,1], c='r')\n",
    "pyplot.scatter(data[t==1,0], data[t==1,1], c='b')\n",
    "\n",
    "pyplot.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
