
Commit 2bc559f

first version ada boost
1 parent 561ec31 commit 2bc559f

File tree

4 files changed: +291 −2 lines changed

.idea/workspace.xml

+2 −2

AdaBoost/adaboost.py

+147
@@ -0,0 +1,147 @@
# encoding=utf-8
# @Author: wendesi
# @Date: 15-11-16

# @Last modified by: wendesi
# @Last modified time: 15-11-16

import math


class Sign(object):
    """Decision stump over a single feature column; the split point found in
    training is reused as the value threshold in predict."""

    def __init__(self, features, labels, w):
        self.X = features        # one feature column, one value per sample
        self.Y = labels          # labels in {+1, -1}
        self.N = len(labels)

        self.w = w               # current sample weights

    def _train_less_than_(self):
        # Try every split point i, predicting +1 for samples with index < i,
        # and keep the split with the lowest weighted error.
        index = -1
        error_score = 1000000

        for i in xrange(self.N + 1):
            score = 0
            for j in xrange(self.N):
                val = -1
                if j < i:
                    val = 1

                if val * self.Y[j] < 0:
                    score += self.w[j]

            if score < error_score:
                index = i
                error_score = score

        return index, error_score

    def _train_more_than_(self):
        # Same search with the opposite orientation: -1 for index < i.
        index = -1
        error_score = 1000000

        for i in xrange(self.N + 1):
            score = 0
            for j in xrange(self.N):
                val = 1
                if j < i:
                    val = -1

                if val * self.Y[j] < 0:
                    score += self.w[j]

            if score < error_score:
                index = i
                error_score = score

        return index, error_score

    def train(self):
        less_index, less_score = self._train_less_than_()
        more_index, more_score = self._train_more_than_()

        if less_score < more_score:
            self.is_less = True
            self.index = less_index
            return less_score
        else:
            self.is_less = False
            self.index = more_index
            return more_score

    def predict(self, feature):
        if self.is_less:
            if feature < self.index:
                return 1.0
            else:
                return -1.0
        else:
            if feature < self.index:
                return -1.0
            else:
                return 1.0


class AdaBoost(object):

    def __init__(self):
        pass

    def _init_parameters_(self, features, labels):
        self.X = features
        self.Y = labels

        self.n = len(features[0])
        self.N = len(features)
        self.M = 100                       # number of weak classifiers

        self.w = [1.0 / self.N] * self.N   # initial uniform sample weights
        self.alpha = []
        self.classifier = []

    def _w_(self, index, classifier, i):
        # Unnormalised weight update: w_i * exp(-alpha_m * y_i * G_m(x_i))
        return self.w[i] * math.exp(-self.alpha[-1] * self.Y[i] * classifier.predict(self.X[i][index]))

    def _Z_(self, index, classifier):
        # Normalisation constant Z_m
        Z = 0

        for i in xrange(self.N):
            Z += self._w_(index, classifier, i)

        return Z

    def train(self, features, labels):

        self._init_parameters_(features, labels)

        for times in xrange(self.M):

            # (weighted error rate, classifier, feature index it was trained on)
            best_classifier = (100000, None, None)
            for i in xrange(self.n):
                features = map(lambda x: x[i], self.X)
                classifier = Sign(features, self.Y, self.w)
                error_score = classifier.train()

                if error_score < best_classifier[0]:
                    best_classifier = (error_score, classifier, i)

            em = best_classifier[0]
            # alpha_m = 0.5 * ln((1 - e_m) / e_m)
            self.alpha.append(0.5 * math.log((1 - em) / em))
            self.classifier.append(best_classifier[1:])

            # _Z_ and _w_ take (feature index, classifier)
            Z = self._Z_(best_classifier[2], best_classifier[1])

            for i in xrange(self.N):
                self.w[i] = self._w_(best_classifier[2], best_classifier[1], i) / Z


if __name__ == '__main__':
    features = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]
    labels = [1, 1, 1, -1, -1, -1, 1, 1, 1, -1]

    ada = AdaBoost()
    ada.train(features, labels)
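
The AdaBoost class above stores the alpha values and the trained (classifier, feature index) pairs but does not yet expose an ensemble predict method. As a minimal sketch (a hypothetical helper, not part of the committed file), the standard AdaBoost decision G(x) = sign(sum_m alpha_m * G_m(x)) could be computed from those two lists:

# Hypothetical helper, not part of this commit: ensemble prediction
# from ada.alpha and the stored (Sign classifier, feature index) pairs.
def adaboost_predict(ada, feature_vector):
    total = 0.0
    for alpha, (clf, feature_index) in zip(ada.alpha, ada.classifier):
        total += alpha * clf.predict(feature_vector[feature_index])
    return 1.0 if total > 0 else -1.0

# Example, continuing the __main__ block above:
# print adaboost_predict(ada, [3])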

AdaBoost/adaboost_fakedata.py

+6
@@ -0,0 +1,6 @@
# encoding=utf-8
# @Author: wendesi
# @Date: 15-11-16

# @Last modified by: wendesi
# @Last modified time: 15-11-16

AdaBoost/generate_dataset.py

+136
@@ -0,0 +1,136 @@
# encoding=utf8
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt

N = 10  # number of training samples to generate

# Solve AX=0; equivalent to MATLAB's null(a','r')
def null(a, rtol=1e-5):
    u, s, v = np.linalg.svd(a)
    rank = (s > rtol * s[0]).sum()
    return rank, v[rank:].T.copy()

# Sign function; vectorised with np.vectorize before use
def sign(x):
    if x > 0:
        return 1
    elif x == 0:
        return 0
    elif x < 0:
        return -1

# noisy=False: generate N linearly separable dim-dimensional samples X with labels y
# noisy=True: flip a fraction of the labels so the data is no longer linearly separable
def mk_data(N, noisy=False):
    rang = [-10, 10]
    dim = 2

    X = np.random.rand(dim, N) * (rang[1] - rang[0]) + rang[0]

    while True:
        # Pick a random hyperplane and label the data by which side each
        # sample falls on; retry if any sample lies exactly on the hyperplane.
        Xsample = np.concatenate((np.ones((1, dim)), np.random.rand(dim, dim) * (rang[1] - rang[0]) + rang[0]))
        k, w = null(Xsample.T)
        y = sign(np.dot(w.T, np.concatenate((np.ones((1, N)), X))))
        if np.all(y):
            break

    if noisy:
        # Flip roughly N/10 labels to make the data linearly inseparable
        idx = random.sample(range(1, N), N / 10)

        for id in idx:
            y[0][id] = -y[0][id]

    return (X, y, w)

def data_visualization(X, y, title):
    class_1 = [[], []]
    class_2 = [[], []]

    size = len(y)

    for i in xrange(size):
        X_1 = X[0][i]
        X_2 = X[1][i]

        if y[i] == 1:
            class_1[0].append(X_1)
            class_1[1].append(X_2)
        else:
            class_2[0].append(X_1)
            class_2[1].append(X_2)

    plt.figure(figsize=(8, 6), dpi=80)
    plt.title(title)

    axes = plt.subplot(111)

    type1 = axes.scatter(class_1[0], class_1[1], s=20, c='red')
    type2 = axes.scatter(class_2[0], class_2[1], s=20, c='green')

    plt.show()

def rebuild_features(features):
    # Convert [[x1 ...], [x2 ...]] column lists into [[x1, x2], ...] rows
    size = len(features[0])

    new_features = []
    for i in xrange(size):
        new_features.append([features[0][i], features[1][i]])

    return new_features

def generate_dataset(size, noisy=False, visualization=True):
    global sign
    sign = np.vectorize(sign)
    X, y, w = mk_data(size, noisy)
    y = list(y[0])

    if visualization:
        data_visualization(X, y, 'all data')  # visualize all data

    testset_size = int(len(y) * 0.333)

    indexes = [i for i in xrange(len(y))]
    test_indexes = random.sample(indexes, testset_size)
    train_indexes = list(set(indexes) - set(test_indexes))

    trainset_features = [[], []]
    trainset_labels = []

    testset_features = [[], []]
    testset_labels = []

    for i in test_indexes:
        testset_features[0].append(X[0][i])
        testset_features[1].append(X[1][i])
        testset_labels.append(y[i])

    if visualization:
        data_visualization(testset_features, testset_labels, 'test set')

    for i in train_indexes:
        trainset_features[0].append(X[0][i])
        trainset_features[1].append(X[1][i])
        trainset_labels.append(y[i])

    if visualization:
        data_visualization(trainset_features, trainset_labels, 'train set')

    return rebuild_features(trainset_features), trainset_labels, rebuild_features(testset_features), testset_labels


if __name__ == '__main__':

    size = 1000
    generate_dataset(size)

    # generate_dataset
    # print sign
    # sign = np.vectorize(sign)
    # X,y,w = mk_data(size,False)
    #
    # data_visualization(X,y)
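
For reference, a minimal usage sketch (not part of this commit) of generate_dataset, showing the four values it returns and the roughly 2:1 train/test split implied by testset_size = int(len(y) * 0.333):

# Hypothetical usage, not part of the committed file.
train_X, train_y, test_X, test_y = generate_dataset(30, noisy=False, visualization=False)
print len(train_X), len(test_X)   # roughly a 2:1 train/test split
print train_X[0], train_y[0]      # one [x1, x2] feature pair and its +1/-1 label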
