 import cv2
 import time
-import math
+import logging
 import numpy as np
 import pandas as pd


 from sklearn.cross_validation import train_test_split
 from sklearn.metrics import accuracy_score

+
 total_class = 10

+def log(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        logging.debug('start %s()' % func.__name__)
+        ret = func(*args, **kwargs)
+
+        end_time = time.time()
+        logging.debug('end %s(), cost %s seconds' % (func.__name__,end_time - start_time))
+
+        return ret
+    return wrapper
+
+
 # Binarization
 def binaryzation(img):
     cv_img = img.astype(np.uint8)
     cv2.threshold(cv_img,50,1,cv2.cv.CV_THRESH_BINARY_INV,cv_img)
     return cv_img

+@log
 def binaryzation_features(trainset):
     features = []
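
The `log` decorator introduced in this hunk is a plain timing wrapper: it writes a DEBUG record when the decorated function is entered, calls the function, and writes a second record with the elapsed wall-clock time. Below is a minimal standalone sketch of the same pattern; the `basicConfig` call and the stand-in function `load_data` are illustrative assumptions, not part of the patch.

import time
import logging

# Send DEBUG records to stderr so the decorator's messages are visible.
logging.basicConfig(level=logging.DEBUG)

def log(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        logging.debug('start %s()' % func.__name__)
        ret = func(*args, **kwargs)
        end_time = time.time()
        logging.debug('end %s(), cost %s seconds' % (func.__name__, end_time - start_time))
        return ret
    return wrapper

@log
def load_data():
    # Stand-in workload; any decorated function is timed the same way.
    time.sleep(0.1)
    return 42

load_data()

Because the script's main block sets the root logger to DEBUG, every function decorated with @log in this file reports its runtime in the same format.
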
@@ -49,8 +64,6 @@ def predict(self,features):
         if self.node_type == 'leaf':
             return self.Class

-        print 'in'
-
         tree = self.dict[features[self.feature]]
         return tree.predict(features)

@@ -94,34 +107,22 @@ def calc_ent_grap(x,y):

     return ent_grap

-def train(train_set,train_label,features,epsilon):
+def recurse_train(train_set,train_label,features,epsilon):
     global total_class

     LEAF = 'leaf'
     INTERNAL = 'internal'

-
     # Step 1: if all instances in train_set belong to the same class Ck
-    label_dict = [0 for i in xrange(total_class)]
-    for label in train_label:
-        label_dict[label] += 1
-
-    for label, label_count in enumerate(label_dict):
-        if label_count == len(train_label):
-            tree = Tree(LEAF,Class=label)
-            return tree
+    label_set = set(train_label)
+    if len(label_set) == 1:
+        return Tree(LEAF,Class=label_set.pop())

     # Step 2: if features is empty
-    max_len,max_class = 0,0
-    for i in xrange(total_class):
-        class_i = filter(lambda x:x==i,train_label)
-        if len(class_i) > max_len:
-            max_class = i
-            max_len = len(class_i)
+    (max_class,max_len) = max([(i,len(filter(lambda x:x==i,train_label))) for i in xrange(total_class)],key=lambda x:x[1])

     if len(features) == 0:
-        tree = Tree(LEAF,Class=max_class)
-        return tree
+        return Tree(LEAF,Class=max_class)

     # Step 3: compute the information gain
     max_feature = 0
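
The one-liner that replaces the counting loop in step 2 builds a (class, count) pair for each class index and keeps the pair with the largest count, which is the majority class of train_label. A quick illustration on made-up toy labels (Python 2, like the rest of the file):

# Toy labels: class 1 occurs three times, more often than any other class.
train_label = [0, 1, 1, 2, 1, 0]
total_class = 3

# Pair each class index with its count, then keep the pair with the largest count.
(max_class, max_len) = max([(i, len(filter(lambda x: x == i, train_label))) for i in xrange(total_class)], key=lambda x: x[1])
print max_class, max_len   # prints: 1 3

max_class is then used as the leaf label whenever the recursion stops early, in step 2 and again in step 4.
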
@@ -138,8 +139,7 @@ def train(train_set,train_label,features,epsilon):

     # Step 4: the maximum gain is below the threshold epsilon
     if max_gda < epsilon:
-        tree = Tree(LEAF,Class=max_class)
-        return tree
+        return Tree(LEAF,Class=max_class)

     # Step 5: build the non-empty subsets
     sub_features = filter(lambda x:x!=max_feature,features)
@@ -157,11 +157,16 @@ def train(train_set,train_label,features,epsilon):
         sub_train_set = train_set[index]
         sub_train_label = train_label[index]

-        sub_tree = train(sub_train_set,sub_train_label,sub_features,epsilon)
+        sub_tree = recurse_train(sub_train_set,sub_train_label,sub_features,epsilon)
         tree.add_tree(feature_value,sub_tree)

     return tree

+@log
+def train(train_set,train_label,features,epsilon):
+    return recurse_train(train_set,train_label,features,epsilon)
+
+@log
 def predict(test_set,tree):

     result = []
@@ -173,61 +178,25 @@ def predict(test_set,tree):


 if __name__ == '__main__':
-    # classes = [0,0,1,1,0,0,0,1,1,1,1,1,1,1,0]
-    #
-    # age = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2]
-    # occupation = [0,0,1,1,0,0,0,1,0,0,0,0,1,1,0]
-    # house = [0,0,0,1,0,0,0,1,1,1,1,1,0,0,0]
-    # loan = [0,1,1,0,0,0,1,1,2,2,2,1,1,2,0]
-    #
-    # features = []
-    #
-    # for i in range(15):
-    #     feature = [age[i],occupation[i],house[i],loan[i]]
-    #     features.append(feature)
-    #
-    # trainset = np.array(features)
-    #
-    # tree = train(trainset,np.array(classes),[0,1,2,3],0.1)
-    #
-    # print type(tree)
-    # features = [0,0,0,1]
-    # print tree.predict(np.array(features))
-
-
-    print 'Start read data'
-
-    time_1 = time.time()
+    logger = logging.getLogger()
+    logger.setLevel(logging.DEBUG)

     raw_data = pd.read_csv('../data/train.csv',header=0)
     data = raw_data.values

     imgs = data[0::,1::]
     labels = data[::,0]

+    # Binarize the images
     features = binaryzation_features(imgs)

     # Use 2/3 of the data as the training set and 1/3 as the test set
     train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=23323)
-    # print train_features.shape
-    # print train_features.shape
-
-    time_2 = time.time()
-    print 'read data cost ',time_2 - time_1,' second','\n'

-    print 'Start training'
-    tree = train(train_features,train_labels,[i for i in range(784)],0.2)
-    print type(tree)
-    print 'knn do not need to train'
-    time_3 = time.time()
-    print 'training cost ',time_3 - time_2,' second','\n'
-
-    print 'Start predicting'
+    tree = train(train_features,train_labels,[i for i in range(784)],0.1)
     test_predict = predict(test_features,tree)
-    time_4 = time.time()
-    print 'predicting cost ',time_4 - time_3,' second','\n'
-
     score = accuracy_score(test_labels,test_predict)
+
     print "The accuracy score is ", score
