
AdaBoost algorithm implementation

yufeng0528 · 4 years ago
parent commit b0c03a5a80
3 changed files with 148 additions and 38 deletions
  1. integr/my_ada_boost.py  + 86 - 0
  2. tree/__init__.py  + 0 - 0
  3. tree/my_tree.py  + 62 - 38

+ 86 - 0
integr/my_ada_boost.py

@@ -0,0 +1,86 @@
+# -*- encoding:utf-8 -*-
+from sklearn.datasets import load_wine, load_iris, load_breast_cancer
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+import numpy as np
+from tree import my_tree
+
+wine = load_breast_cancer()  # despite the variable name, this is the breast_cancer dataset
+Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)
+
+# initial (uniform) sample weights
+w = np.array([1/len(Ytrain) for i in range(len(Ytrain))])
+# per-round tree weights (alpha)
+alpha = []
+# the weak learners
+trees = []
+pn = 1/len(Ytrain)  # contribution of one sample to the unweighted error
+
+for i in range(len(Ytrain)):  # relabel {0, 1} -> {-1, +1} as AdaBoost expects
+    if Ytrain[i] == 0:
+        Ytrain[i] = -1
+for i in range(len(Ytest)):
+    if Ytest[i] == 0:
+        Ytest[i] = -1
+print(Ytest)
+
+for i in range(30):
+    # train one weak decision tree on the weighted sample
+    # clf = DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=1,
+    #                              class_weight={0:w0, 1:1-w0})  # criterion defaults to gini when omitted
+    nodes = my_tree.fit(Xtrain, Ytrain, None, 0, w)
+
+    # my_tree.print_width([nodes], 1)
+    # print("entropy", my_tree.calc_ent(Ytrain, w))
+    Xpredit = my_tree.predict(Xtrain, Ytrain, nodes)
+    error = 0    # weighted training error
+    p_error = 0  # unweighted training error
+    for j in range(len(Ytrain)):
+        if Xpredit[j] != Ytrain[j]:
+            error += w[j]
+            p_error += pn
+
+    if error > 0.5:  # no better than chance: discard this round
+        continue
+    if error == 0:   # guard against division by zero in alpha
+        error = 0.001
+
+    print("round", i, "error rate", p_error, error)
+    ab = 0.5*np.log2(1/error - 1)  # classical AdaBoost uses the natural log: 0.5*np.log((1-error)/error)
+
+    # update the sample weights: misclassified samples gain weight
+    for j in range(len(Ytrain)):
+        w[j] = w[j]*np.exp(-ab*Ytrain[j]*Xpredit[j])
+    sum_w = sum(w)
+    w = w/sum_w
+
+    alpha.append(ab)
+    trees.append(nodes)
+
+predicts = []
+for tree in trees:
+    predicts.append(my_tree.predict(Xtest, None, tree))
+
+print(alpha)
+# weighted vote over the ensemble
+result = np.zeros(len(Xtest), float)
+# each tree's vote is scaled by its own alpha (the original inner loop summed
+# all alphas for every tree, multiplying each vote by the same constant)
+for k in range(len(trees)):
+    r = alpha[k] * predicts[k]
+    result = result + r
+
+print("before sign:", result)
+result = np.sign(result)
+print("after sign:", result)
+
+# print(1- sum(np.bitwise_xor(Ytest, result))/len(result))
+# print(result == Ytest)
+print(len([i for i in result == Ytest if i])/len(result))  # ensemble accuracy on the test set
+# cmp = np.concatenate(([result], [Ytest]), axis=0)
+# print(cmp)
+
+
+clf = DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=1)  # single-stump baseline
+clf = clf.fit(Xtrain, Ytrain)
+print(clf.score(Xtest, Ytest))
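
Note: for reference, the textbook AdaBoost.M1 round that this file implements, written as a minimal self-contained sketch using sklearn stumps in place of my_tree. adaboost_fit and adaboost_predict are illustrative names, and the sketch uses the classical natural-log alpha where the code above uses log2:

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    def adaboost_fit(X, y, rounds=30):
        # labels must be in {-1, +1}
        n = len(y)
        w = np.full(n, 1.0 / n)                   # uniform initial sample weights
        stumps, alphas = [], []
        for _ in range(rounds):
            stump = DecisionTreeClassifier(max_depth=1).fit(X, y, sample_weight=w)
            pred = stump.predict(X)
            err = w[pred != y].sum()              # weighted training error
            if err >= 0.5:                        # no better than chance: stop
                break
            err = max(err, 1e-3)                  # guard against a perfect stump
            a = 0.5 * np.log((1 - err) / err)     # natural-log alpha
            w = w * np.exp(-a * y * pred)         # up-weight misclassified samples
            w = w / w.sum()
            stumps.append(stump)
            alphas.append(a)
        return stumps, alphas

    def adaboost_predict(X, stumps, alphas):
        # weighted vote: each stump's prediction is scaled by its own alpha
        score = sum(a * s.predict(X) for a, s in zip(alphas, stumps))
        return np.sign(score)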

+ 0 - 0
tree/__init__.py


+ 62 - 38
tree/my_tree.py

@@ -5,7 +5,8 @@ from sklearn.model_selection import train_test_split
 import numpy as np
 
 feature_name = ['alcohol', 'malic acid', 'ash', 'alkalinity of ash', 'magnesium', 'total phenols', 'flavanoids',
-                'nonflavanoid phenols', 'proanthocyanins', 'color intensity', 'hue', 'od280/od315 of diluted wines', 'proline']
+                'nonflavanoid phenols', 'proanthocyanins', 'color intensity', 'hue', 'od280/od315 of diluted wines', 'proline',
+                'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T']  # placeholder names so indexing also works for datasets with more features (breast_cancer has 30)
 class_names = ["Gin", "Sherry", "Vermouth"]
 
 # node type used to build the decision tree
@@ -31,9 +32,9 @@ class TreeNode(object):
         self.left = left
         self.right = right
 
-        if self.y == -1:
-            self.y = np.where(value == np.max(value))[0][0]
-            print(self.y, self.value)
+        # if self.y == -1:
+        #     self.y = np.where(value == np.max(value))[0][0]
+        #     print(self.y, self.value)
 
     def __str__(self):
         if self.idx == -1:
@@ -57,14 +58,18 @@ def read_data():
     return Xtrain, Xtest, Ytrain, Ytest
 
 
-def calc_ent(x):
+def calc_ent(x, weights=None):
     """
         calculate the Shannon entropy of x
     """
     x_value_list = set([x[i] for i in range(x.shape[0])])
     ent = 0.0
     for x_value in x_value_list:
-        p = float(x[x == x_value].shape[0]) / x.shape[0]
+        if weights is None:
+            p = float(x[x == x_value].shape[0]) / x.shape[0]
+        else:
+            # weighted relative frequency of this value
+            p = np.sum(weights[x == x_value]) / np.sum(weights)
         logp = np.log2(p)
         ent -= p * logp
 
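The weighted branch generalizes the unweighted one: p(v) becomes the normalized weight mass of value v, and with uniform weights the two branches agree. The same computation as a compact sketch (weighted_entropy is an illustrative name):

    import numpy as np

    def weighted_entropy(y, weights=None):
        if weights is None:
            weights = np.ones(len(y))
        weights = weights / weights.sum()
        ent = 0.0
        for v in np.unique(y):
            p = weights[y == v].sum()   # weight mass of class v
            if p > 0:                   # skip empty classes; log2(0) is undefined
                ent -= p * np.log2(p)
        return ent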
@@ -108,31 +113,33 @@ def calc_ent1(x):
 
 
 # information gain of a single attribute
-def cal_ent_attr(Xtrain, Ytrain):
-    print('sharp', Xtrain.shape)
-
+def cal_ent_attr(Xtrain, Ytrain, weights):
+    # print('shape', Xtrain.shape)
+    weights = weights / sum(weights)
     # for each attribute
     min_ent = 100
     min_i = 0
     min_mean = 0
-    for i in range(Xtrain.shape[1]):
+
+    for i in np.random.randint(0,Xtrain.shape[1],size=(15)):  # random subset of 15 candidate features, drawn with replacement
         x_value_list = set([Xtrain[j][i] for j in range(Xtrain.shape[0])])
         mean = sum(x_value_list)/len(x_value_list)
         sum_ent = 0
         # binary split at the mean
         p = Ytrain[Xtrain[:, i] > mean]
-        sum_ent = sum_ent + calc_ent(p)*len(p)/len(Ytrain)
+        p0 = sum(weights[Xtrain[:, i] > mean])  # weight mass of the right branch
+        sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] > mean])*p0
         p = Ytrain[Xtrain[:, i] <= mean]
-        sum_ent = sum_ent + calc_ent(p)*len(p)/len(Ytrain)
+        sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] <= mean])*(1-p0)
 
-        if sum_ent < min_ent:
+        if sum_ent <= min_ent:
            min_ent = sum_ent
            min_i = i
            min_mean = mean
     return min_i, min_mean, min_ent
 
 
-def cal_max_ent_attr_c45(Xtrain, Ytrain):
+def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
     max_ent = 0
     max_mean = 0
     h = calc_ent(Ytrain)
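np.random.randint draws the 15 candidate feature indices with replacement, so one feature can be scored twice and another never considered in a given split search. If that is not intended, sampling without replacement is the usual random-subspace choice; a sketch, assuming X has at least 15 columns (candidate_features is an illustrative name):

    import numpy as np

    def candidate_features(X, k=15):
        # draw k distinct column indices, so no feature is scored twice
        return np.random.choice(X.shape[1], size=k, replace=False)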
@@ -140,12 +147,17 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain):
         left = Xtrain[:k + 1]
         right = Xtrain[k + 1:]
 
-        left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
-        right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
+        if weights is None:
+            left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
+            right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
+        else:  # weighted analogue of the branch above
+            left_ent = calc_ent(Ytrain[:k+1], weights[:k+1])*len(left)/len(Ytrain)
+            right_ent = calc_ent(Ytrain[k + 1:], weights[k+1:])*len(right)/len(Ytrain)
 
         iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
         iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
 
+
         gain_ent = (h - left_ent - right_ent)/iv
 
         if gain_ent > max_ent:
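This loop scores C4.5's gain ratio: the information gain of a split normalized by the split's intrinsic value iv. The same computation as a small function (gain_ratio is an illustrative name):

    import numpy as np

    def gain_ratio(h, left_ent, right_ent, n_left, n_right):
        # h: entropy before the split; left_ent/right_ent: weighted child entropies
        n = n_left + n_right
        iv = -(n_left / n) * np.log2(n_left / n) - (n_right / n) * np.log2(n_right / n)
        return (h - left_ent - right_ent) / iv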
@@ -153,6 +165,8 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain):
             max_mean = left[-1]
     return max_ent, max_mean
 
+# sample weights
+weights = []
 
 # information gain ratio of a single attribute
 def cal_ent_attr_c45(Xtrain, Ytrain):
@@ -174,7 +188,7 @@
 
 # Gini index of a single attribute
 def cal_gini_attr(Xtrain, Ytrain):
-    print('sharp', Xtrain.shape)
+    # print('shape', Xtrain.shape)
 
     # for each attribute
     min_ent = 100
@@ -196,7 +210,7 @@
             min_mean = mean
     return min_i, min_mean, min_ent
 
-MAX_T = 5
+MAX_T = 1  # depth limit of 1: every weak learner is a decision stump
 
 
 def is_end(Ytrain):
@@ -206,22 +220,24 @@ def is_end(Ytrain):
         return True
 
 # force this node to become a leaf (weighted majority label)
-def leaf_node(Ytrain):
+def leaf_node(Ytrain, weights):
     p_set = []
+    k = 0  # index of the current sample's weight
     for item in Ytrain:
         for i in p_set:
             if i[0] == item:
-                i[1] = i[1] + 1
+                i[1] = i[1] + weights[k]
                 break
         else:
-            i = [item, 1]
+            i = [item, weights[k]]
             p_set.append(i)
+        k = k + 1
 
     max_item = [0, 0]
     for item in p_set:
         if item[1] > max_item[1]:
             max_item = item
-    print('leaf node, value:', max_item[0])
+    # print('leaf node, value:', max_item[0])
     return TreeNode(-1, 0, 0, True, max_item[0], len(Ytrain), distrib(Ytrain))
 
 
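With weights, the leaf label becomes the class with the largest total sample weight rather than the largest count; with uniform weights the two coincide. The same rule in compact form (weighted_majority is an illustrative name):

    def weighted_majority(y, weights):
        totals = {}
        for label, wt in zip(y, weights):
            totals[label] = totals.get(label, 0.0) + wt
        return max(totals, key=totals.get)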
@@ -235,18 +251,18 @@ def distrib(Ytrain):
     return d_list
 
 
-def fit(Xtrain, Ytrain, parent_node, depth):
+def fit(Xtrain, Ytrain, parent_node, depth, weights):
 
     if is_end(Ytrain):
-        print('this is a leaf node')
+        # print('this is a leaf node')
         return TreeNode(-1, 0, 0, True, -1, len(Ytrain), distrib(Ytrain))
 
     if depth >= MAX_T:
-        return leaf_node(Ytrain)
+        return leaf_node(Ytrain, weights)
 
-    i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain)
+    i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain, weights)
     total_ent = calc_ent(Ytrain)
-    print("attribute", i, "mean:", mean)
+    # print("attribute", i, "mean:", mean)
     # create the split node
     parent_node = TreeNode(i, mean, total_ent - min_ent, False, -1, len(Ytrain), distrib(Ytrain))
 
@@ -255,12 +271,12 @@ def fit(Xtrain, Ytrain, parent_node, depth):
     right_Xtrain = Xtrain[Xtrain[:, i] > mean]
     # right_Xtrain = np.delete(right_Xtrain, i, axis=1)  # this attribute may be split again further down
 
-    right_node = fit(right_Xtrain, right_Ytrain, parent_node, depth+1)
+    right_node = fit(right_Xtrain, right_Ytrain, parent_node, depth+1, weights[Xtrain[:, i] > mean])
 
     left_Ytrain = Ytrain[Xtrain[:, i] <= mean]
     left_Xtrain = Xtrain[Xtrain[:, i] <= mean]
     # left_Xtrain = np.delete(left_Xtrain, i, axis=1)
-    left_node = fit(left_Xtrain, left_Ytrain, parent_node, depth + 1)
+    left_node = fit(left_Xtrain, left_Ytrain, parent_node, depth + 1, weights[Xtrain[:, i] <= mean])
 
     parent_node.left = left_node
     parent_node.right = right_node
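The recursion slices weights with the same boolean mask used for Xtrain and Ytrain, so each subtree sees exactly the weights of its own samples. The invariant as a sketch (split_by_threshold is an illustrative name):

    import numpy as np

    def split_by_threshold(X, y, w, i, mean):
        # one mask partitions samples, labels and weights together,
        # keeping the weight vector aligned with its rows
        mask = X[:, i] > mean
        return (X[mask], y[mask], w[mask]), (X[~mask], y[~mask], w[~mask])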
@@ -286,29 +302,37 @@ def print_width(nodes, depth):
 
 def predit_one(X, Y, node):
     if node.is_leaf:
-        print(class_names[node.y], class_names[Y])
+        # print(class_names[node.y], class_names[Y])
+        if node.y == 0:
+            return -1
+        return node.y
     else:
         if X[node.idx] <= node.idx_value:
-            predit_one(X,Y,node.left)
+            return predit_one(X, Y, node.left)
         else:
-            predit_one(X, Y, node.right)
+            return predit_one(X, Y, node.right)
 
 
-def predit(Xtest, Ytest, node):
-    for i in range(Xtest.shape[1]):
-        predit_one(Xtest[i], Ytest[i], node)
+def predict(Xtest, Ytest, node):
+    result = []
+    for i in range(Xtest.shape[0]):
+        result.append(predit_one(Xtest[i], None, node))
+    return np.array(result)
 
 
 if __name__ == '__main__':
     Xtrain, Xtest, Ytrain, Ytest = read_data()
     print(calc_ent1(Ytrain))
-    print(calc_ent(Ytrain))
+
+    weights = np.ones(len(Ytrain))/Ytrain.shape[0]
+    print("entropy", calc_ent(Ytrain))
+    print("entropy", calc_ent(Ytrain, weights))
 
     print("Gini index", cal_gini(Ytrain))
 
     print("information gain ratio", cal_ent_attr_c45(Xtrain, Ytrain))
 
-    node = fit(Xtrain, Ytrain, None, 0)
+    node = fit(Xtrain, Ytrain, None, 0, weights)
     print_width([node], 1)
 
-    predit(Xtest, Ytest, node)
+    print(predict(Xtest, Ytest, node))
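
For a reference point, sklearn's own AdaBoost with 30 stumps can be run on the same split defined in my_ada_boost.py; a usage sketch (note the first constructor parameter is named estimator in recent sklearn releases and base_estimator in older ones):

    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier

    ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=30)
    ada.fit(Xtrain, Ytrain)
    print(ada.score(Xtest, Ytest))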