Browse Source

ada_boost算法实现

yufeng0528 4 years ago
parent
commit
b0c03a5a80
3 changed files with 148 additions and 38 deletions
  1. 86 0
      integr/my_ada_boost.py
  2. 0 0
      tree/__init__.py
  3. 62 38
      tree/my_tree.py

+ 86 - 0
integr/my_ada_boost.py

@@ -0,0 +1,86 @@
# -*- encoding:utf-8 -*-
"""AdaBoost (adaptive boosting) driver built on the hand-rolled decision
stump in tree.my_tree, compared against a single sklearn depth-1 tree.

Labels are remapped from {0, 1} to {-1, +1} so the final ensemble can be
evaluated as sign(sum_i alpha_i * h_i(x)).
"""
from sklearn.datasets import load_wine, load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from tree import my_tree

wine = load_breast_cancer()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)

# Initial (uniform) sample weights.
w = np.full(len(Ytrain), 1 / len(Ytrain))
# Per-round classifier weights alpha_i.
alpha = []
# The weak learners (one stump per boosting round).
trees = []
# Unweighted per-sample probability, used only to report a plain error rate.
pn = 1 / len(Ytrain)

# AdaBoost needs labels in {-1, +1}; remap class 0 to -1 (vectorized
# replacement for the original element-wise loops).
Ytrain = np.where(Ytrain == 0, -1, Ytrain)
Ytest = np.where(Ytest == 0, -1, Ytest)
print(Ytest)

for i in range(30):
    # Train a weak learner on the current sample weights.
    nodes = my_tree.fit(Xtrain, Ytrain, None, 0, w)

    Xpredit = my_tree.predict(Xtrain, Ytrain, nodes)
    # Weighted training error of this round's learner (error), plus the
    # plain unweighted error rate (p_error) for reporting.
    error = 0
    p_error = 0
    for j in range(len(Ytrain)):
        if Xpredit[j] != Ytrain[j]:
            error += w[j]
            p_error += pn

    # A learner worse than random guessing is discarded for this round.
    if error > 0.5:
        continue
    # Clamp a perfect learner so alpha stays finite (avoids log(1/0)).
    if error == 0:
        error = 0.001

    print("第", i, "轮错误率", p_error, error)
    # NOTE(review): textbook AdaBoost uses the natural log here; log2 only
    # rescales every alpha by the same constant, which leaves the sign of
    # the final vote unchanged but does alter the weight updates below.
    ab = 0.5 * np.log2(1 / error - 1)

    # Reweight samples: misclassified points (Ytrain*Xpredit == -1) gain
    # weight, correct ones lose weight; then renormalize to sum to 1.
    for j in range(len(Ytrain)):
        w[j] = w[j] * np.exp(-ab * Ytrain[j] * Xpredit[j])
    w = w / sum(w)

    alpha.append(ab)
    trees.append(nodes)

# Per-learner predictions on the test set.
predicts = []
for tree in trees:
    predicts.append(my_tree.predict(Xtest, None, tree))

print(alpha)
# Weighted majority vote: sum_i alpha_i * h_i(x).
# BUG FIX: the original inner loop multiplied EVERY prediction vector by
# the sum of all alphas (for p: for w_alpha in alpha: r += w_alpha * p),
# instead of pairing each learner with its own alpha.
result = np.zeros(len(Xtest), float)
for ab, p in zip(alpha, predicts):
    result = result + ab * p

print("sign前:", result)
result = np.sign(result)
print("sign后:", result)

# Ensemble accuracy on the test set.
print(len([i for i in result == Ytest if i]) / len(result))

# Baseline: a single depth-1 sklearn tree for comparison.
clf = DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=1)
clf = clf.fit(Xtrain, Ytrain)
print(clf.score(Xtest, Ytest))

+ 0 - 0
tree/__init__.py


+ 62 - 38
tree/my_tree.py

@@ -5,7 +5,8 @@ from sklearn.model_selection import train_test_split
5
 import numpy as np
5
 import numpy as np
6
 
6
 
7
 feature_name = ['酒精', '苹果酸', '灰', '灰的碱性', '镁', '总酚', '类黄酮',
7
 feature_name = ['酒精', '苹果酸', '灰', '灰的碱性', '镁', '总酚', '类黄酮',
8
-                '非黄烷类酚类', '花青素', '颜色强度', '色调', 'od280/od315稀释葡萄酒', '脯氨酸']
8
+                '非黄烷类酚类', '花青素', '颜色强度', '色调', 'od280/od315稀释葡萄酒', '脯氨酸'
9
+                , 'A', 'B', 'c', 'D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T']
9
 class_names=["琴酒", "雪莉", "贝尔摩德"]
10
 class_names=["琴酒", "雪莉", "贝尔摩德"]
10
 
11
 
11
 # 生成决策树的节点类型
12
 # 生成决策树的节点类型
@@ -31,9 +32,9 @@ class TreeNode(object):
31
         self.left = left
32
         self.left = left
32
         self.right = right
33
         self.right = right
33
 
34
 
34
-        if self.y == -1:
35
-            self.y = np.where(value == np.max(value))[0][0]
36
-            print(self.y, self.value)
35
+        # if self.y == -1:
36
+        #     self.y = np.where(value == np.max(value))[0][0]
37
+            # print(self.y, self.value)
37
 
38
 
38
     def __str__(self):
39
     def __str__(self):
39
         if self.idx == -1:
40
         if self.idx == -1:
@@ -57,14 +58,18 @@ def read_data():
57
     return Xtrain, Xtest, Ytrain, Ytest
58
     return Xtrain, Xtest, Ytrain, Ytest
58
 
59
 
59
 
60
 
60
-def calc_ent(x):
61
+def calc_ent(x, weights=None):
61
     """
62
     """
62
         calculate shanno ent of x
63
         calculate shanno ent of x
63
     """
64
     """
64
     x_value_list = set([x[i] for i in range(x.shape[0])])
65
     x_value_list = set([x[i] for i in range(x.shape[0])])
65
     ent = 0.0
66
     ent = 0.0
66
     for x_value in x_value_list:
67
     for x_value in x_value_list:
67
-        p = float(x[x == x_value].shape[0]) / x.shape[0]
68
+        if weights is None:
69
+            p = float(x[x == x_value].shape[0]) / x.shape[0]
70
+        else:
71
+            weights = weights/sum(weights)
72
+            p = sum(sum([x == x_value]*weights))
68
         logp = np.log2(p)
73
         logp = np.log2(p)
69
         ent -= p * logp
74
         ent -= p * logp
70
 
75
 
@@ -108,31 +113,33 @@ def calc_ent1(x):
108
 
113
 
109
 
114
 
110
 # 计算某个属性的信息增益
115
 # 计算某个属性的信息增益
111
-def cal_ent_attr(Xtrain, Ytrain):
112
-    print('sharp', Xtrain.shape)
113
-
116
+def cal_ent_attr(Xtrain, Ytrain, weights):
117
+    # print('sharp', Xtrain.shape)
118
+    weights = weights / sum(weights)
114
     # 对每个属性
119
     # 对每个属性
115
     min_ent = 100
120
     min_ent = 100
116
     min_i = 0
121
     min_i = 0
117
     min_mean = 0
122
     min_mean = 0
118
-    for i in range(Xtrain.shape[1]):
123
+
124
+    for i in np.random.randint(0,Xtrain.shape[1],size=(15)):
119
         x_value_list = set([Xtrain[j][i] for j in range(Xtrain.shape[0])])
125
         x_value_list = set([Xtrain[j][i] for j in range(Xtrain.shape[0])])
120
         mean = sum(x_value_list)/len(x_value_list)
126
         mean = sum(x_value_list)/len(x_value_list)
121
         sum_ent = 0
127
         sum_ent = 0
122
         # 二叉树
128
         # 二叉树
123
         p = Ytrain[Xtrain[:, i] > mean]
129
         p = Ytrain[Xtrain[:, i] > mean]
124
-        sum_ent = sum_ent + calc_ent(p)*len(p)/len(Ytrain)
130
+        p0 = sum(weights[Xtrain[:, i] > mean])
131
+        sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] > mean])*p0
125
         p = Ytrain[Xtrain[:, i] <= mean]
132
         p = Ytrain[Xtrain[:, i] <= mean]
126
-        sum_ent = sum_ent + calc_ent(p)*len(p)/len(Ytrain)
133
+        sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] <= mean])*(1-p0)
127
 
134
 
128
-        if sum_ent < min_ent:
135
+        if sum_ent <= min_ent:
129
             min_ent = sum_ent
136
             min_ent = sum_ent
130
             min_i = i
137
             min_i = i
131
             min_mean = mean
138
             min_mean = mean
132
     return min_i, min_mean, min_ent
139
     return min_i, min_mean, min_ent
133
 
140
 
134
 
141
 
135
-def cal_max_ent_attr_c45(Xtrain, Ytrain):
142
+def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
136
     max_ent = 0
143
     max_ent = 0
137
     max_mean = 0
144
     max_mean = 0
138
     h = calc_ent(Ytrain)
145
     h = calc_ent(Ytrain)
@@ -140,12 +147,17 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain):
140
         left = Xtrain[:k + 1]
147
         left = Xtrain[:k + 1]
141
         right = Xtrain[k + 1:]
148
         right = Xtrain[k + 1:]
142
 
149
 
143
-        left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
144
-        right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
150
+        if weights is None:
151
+            left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
152
+            right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
153
+
154
+        else:
155
+            pass
145
 
156
 
146
         iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
157
         iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
147
         iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
158
         iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
148
 
159
 
160
+
149
         gain_ent = (h - left_ent - right_ent)/iv
161
         gain_ent = (h - left_ent - right_ent)/iv
150
 
162
 
151
         if gain_ent > max_ent:
163
         if gain_ent > max_ent:
@@ -153,6 +165,8 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain):
153
             max_mean = left[-1]
165
             max_mean = left[-1]
154
     return  max_ent, max_mean
166
     return  max_ent, max_mean
155
 
167
 
168
+# 样本权重
169
+weights = []
156
 
170
 
157
 # 计算某个属性的信息增益率
171
 # 计算某个属性的信息增益率
158
 def cal_ent_attr_c45(Xtrain, Ytrain):
172
 def cal_ent_attr_c45(Xtrain, Ytrain):
@@ -174,7 +188,7 @@ def cal_ent_attr_c45(Xtrain, Ytrain):
174
 
188
 
175
 # 计算某个属性的基尼指数
189
 # 计算某个属性的基尼指数
176
 def cal_gini_attr(Xtrain, Ytrain):
190
 def cal_gini_attr(Xtrain, Ytrain):
177
-    print('sharp', Xtrain.shape)
191
+    # print('sharp', Xtrain.shape)
178
 
192
 
179
     # 对每个属性
193
     # 对每个属性
180
     min_ent = 100
194
     min_ent = 100
@@ -196,7 +210,7 @@ def cal_gini_attr(Xtrain, Ytrain):
196
             min_mean = mean
210
             min_mean = mean
197
     return min_i, min_mean, min_ent
211
     return min_i, min_mean, min_ent
198
 
212
 
199
-MAX_T = 5
213
+MAX_T = 1
200
 
214
 
201
 
215
 
202
 def is_end(Ytrain):
216
 def is_end(Ytrain):
@@ -206,22 +220,24 @@ def is_end(Ytrain):
206
         return True
220
         return True
207
 
221
 
208
 # 强行划分为叶子节点
222
 # 强行划分为叶子节点
209
-def leaf_node(Ytrain):
223
+def leaf_node(Ytrain, weights):
210
     p_set = []
224
     p_set = []
225
+    k = 0
211
     for item in Ytrain:
226
     for item in Ytrain:
212
         for i in p_set:
227
         for i in p_set:
213
             if i[0] == item:
228
             if i[0] == item:
214
-                i[1] = i[1] + 1
229
+                i[1] = i[1] + weights[k]
215
                 break
230
                 break
216
         else:
231
         else:
217
-            i = [item, 1]
232
+            i = [item, weights[k]]
218
             p_set.append(i)
233
             p_set.append(i)
234
+        k = k + 1
219
 
235
 
220
     max_item = [0, 0]
236
     max_item = [0, 0]
221
     for item in p_set:
237
     for item in p_set:
222
         if item[1] > max_item[1]:
238
         if item[1] > max_item[1]:
223
             max_item = item
239
             max_item = item
224
-    print('这个是叶子节点,value:', max_item[0])
240
+    # print('这个是叶子节点,value:', max_item[0])
225
     return TreeNode(-1, 0, 0, True, max_item[0], len(Ytrain), distrib(Ytrain))
241
     return TreeNode(-1, 0, 0, True, max_item[0], len(Ytrain), distrib(Ytrain))
226
 
242
 
227
 
243
 
@@ -235,18 +251,18 @@ def distrib(Ytrain):
235
     return d_list
251
     return d_list
236
 
252
 
237
 
253
 
238
-def fit(Xtrain, Ytrain, parent_node, depth):
254
+def fit(Xtrain, Ytrain, parent_node, depth, weights):
239
 
255
 
240
     if is_end(Ytrain):
256
     if is_end(Ytrain):
241
-        print('这个是叶子节点')
257
+        # print('这个是叶子节点')
242
         return TreeNode(-1, 0, 0, True, -1, len(Ytrain), distrib(Ytrain))
258
         return TreeNode(-1, 0, 0, True, -1, len(Ytrain), distrib(Ytrain))
243
 
259
 
244
     if depth >= MAX_T:
260
     if depth >= MAX_T:
245
-        return leaf_node(Ytrain)
261
+        return leaf_node(Ytrain, weights)
246
 
262
 
247
-    i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain)
263
+    i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain, weights)
248
     total_ent = calc_ent(Ytrain)
264
     total_ent = calc_ent(Ytrain)
249
-    print("第", i, "个属性,mean:", mean)
265
+    # print("第", i, "个属性,mean:", mean)
250
     # 生成节点
266
     # 生成节点
251
     parent_node = TreeNode(i, mean, total_ent - min_ent, False, -1, len(Ytrain), distrib(Ytrain))
267
     parent_node = TreeNode(i, mean, total_ent - min_ent, False, -1, len(Ytrain), distrib(Ytrain))
252
 
268
 
@@ -255,12 +271,12 @@ def fit(Xtrain, Ytrain, parent_node, depth):
255
     right_Xtrain = Xtrain[Xtrain[:, i] > mean]
271
     right_Xtrain = Xtrain[Xtrain[:, i] > mean]
256
     # right_Xtrain = np.delete(right_Xtrain, i, axis=1) # 这个属性还可以再被切分
272
     # right_Xtrain = np.delete(right_Xtrain, i, axis=1) # 这个属性还可以再被切分
257
 
273
 
258
-    right_node = fit(right_Xtrain, right_Ytrain, parent_node, depth+1)
274
+    right_node = fit(right_Xtrain, right_Ytrain, parent_node, depth+1, weights[Xtrain[:, i] > mean])
259
 
275
 
260
     left_Ytrain = Ytrain[Xtrain[:, i] <= mean]
276
     left_Ytrain = Ytrain[Xtrain[:, i] <= mean]
261
     left_Xtrain = Xtrain[Xtrain[:, i] <= mean]
277
     left_Xtrain = Xtrain[Xtrain[:, i] <= mean]
262
     # left_Xtrain = np.delete(left_Xtrain, i, axis=1)
278
     # left_Xtrain = np.delete(left_Xtrain, i, axis=1)
263
-    left_node = fit(left_Xtrain, left_Ytrain, parent_node, depth + 1)
279
+    left_node = fit(left_Xtrain, left_Ytrain, parent_node, depth + 1, weights[Xtrain[:, i] <= mean])
264
 
280
 
265
     parent_node.left = left_node
281
     parent_node.left = left_node
266
     parent_node.right = right_node
282
     parent_node.right = right_node
@@ -286,29 +302,37 @@ def print_width(nodes, depth):
286
 
302
 
287
 def predit_one(X, Y, node):
303
 def predit_one(X, Y, node):
288
     if node.is_leaf:
304
     if node.is_leaf:
289
-        print(class_names[node.y], class_names[Y])
305
+        # print(class_names[node.y], class_names[Y])
306
+        if node.y == 0:
307
+            return -1
308
+        return node.y
290
     else:
309
     else:
291
         if X[node.idx] <= node.idx_value:
310
         if X[node.idx] <= node.idx_value:
292
-            predit_one(X,Y,node.left)
311
+            return predit_one(X,Y,node.left)
293
         else:
312
         else:
294
-            predit_one(X, Y, node.right)
313
+            return predit_one(X, Y, node.right)
295
 
314
 
296
 
315
 
297
-def predit(Xtest, Ytest, node):
298
-    for i in range(Xtest.shape[1]):
299
-        predit_one(Xtest[i], Ytest[i], node)
316
+def predict(Xtest, Ytest, node):
317
+    result = []
318
+    for i in range(Xtest.shape[0]):
319
+        result.append(predit_one(Xtest[i], None, node))
320
+    return np.array(result)
300
 
321
 
301
 
322
 
302
 if __name__ == '__main__':
323
 if __name__ == '__main__':
303
     Xtrain, Xtest, Ytrain, Ytest = read_data()
324
     Xtrain, Xtest, Ytrain, Ytest = read_data()
304
     print(calc_ent1(Ytrain))
325
     print(calc_ent1(Ytrain))
305
-    print(calc_ent(Ytrain))
326
+
327
+    weights = np.ones(len(Ytrain))/Ytrain.shape[0]
328
+    print("熵值", calc_ent(Ytrain))
329
+    print("熵值", calc_ent(Ytrain, weights))
306
 
330
 
307
     print("基尼指数", cal_gini(Ytrain))
331
     print("基尼指数", cal_gini(Ytrain))
308
 
332
 
309
     print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain))
333
     print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain))
310
 
334
 
311
-    node = fit(Xtrain, Ytrain, None, 0)
335
+    node = fit(Xtrain, Ytrain, None, 0, weights)
312
     print_width([node], 1)
336
     print_width([node], 1)
313
 
337
 
314
-    predit(Xtest, Ytest, node)
338
+    print(predict(Xtest, Ytest, node))