
Check whether performance is better on the training set

yufeng0528 · 4 years ago · commit edde77a300
2 changed files with 26 additions and 20 deletions

  1. integr/my_ada_boost.py (+8 −8)
  2. tree/my_tree.py (+18 −12)

integr/my_ada_boost.py (+8 −8)

@@ -24,7 +24,7 @@ for i in range(len(Ytest)):
         Ytest[i] = -1
 print(Ytest)
 
-for i in range(3):
+for i in range(15):
     # train the decision tree
     # clf = DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=1,
     #                              class_weight={0:w0, 1:1-w0})  # instantiate; if criterion is omitted it defaults to the Gini index
@@ -59,11 +59,11 @@ for i in range(3):
 
 predicts = []
 for tree in trees:
-    predicts.append(my_tree.predict(Xtest, None, tree))
+    predicts.append(my_tree.predict(Xtrain, None, tree))
 
 print(alpha)
 # weight the per-tree results
-result = np.zeros(len(Xtest), float)
+result = np.zeros(len(Xtrain), float)
 for p in predicts:
     r = 0
     for w_alpha in alpha:
@@ -77,7 +77,7 @@ print("sign后:", result)
 # print(1- sum(np.bitwise_xor(Ytest, result))/len(result))
 # print(result == Ytest)
 print()
-x = len([i for i in result == Ytest if i])/len(result)
+x = len([i for i in result == Ytrain if i])/len(result)
 x = x + 0.5*len([i for i in result if i==0])/len(result)
 print(x)
 # cmp = np.concatenate(([result], [Ytest]), axis=0)
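
The accuracy line this hunk switches to Ytrain counts an exact match as fully correct and a zero vote (a tie after sign) as half correct. A minimal standalone check of that formula, using made-up result/Ytrain arrays rather than the script's real data:

import numpy as np

result = np.array([1, -1, 0, 1])   # signed ensemble votes; 0 means a tie (toy values)
Ytrain = np.array([1, 1, -1, 1])   # toy labels

x = len([i for i in result == Ytrain if i]) / len(result)       # exact matches: 2/4
x = x + 0.5 * len([i for i in result if i == 0]) / len(result)  # ties count as half: 0.5 * 1/4
print(x)  # 0.625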
@@ -88,11 +88,11 @@ print(x)
 # clf = clf.fit(Xtrain, Ytrain)
 # print(clf.score(Xtest, Ytest))
 
-w = np.array([1/len(Ytrain) for i in range(len(Ytrain))])
+w = np.array([1/len(Xtrain) for i in range(len(Xtrain))])
 my_tree_0 = trees[0]
-Xpredit = my_tree.predict(Xtest, None, nodes)
+Ypredit = my_tree.predict(Xtrain, None, my_tree_0)
 error = 0
-for j in range(len(Xtest)):
-    if Xpredit[j] != Ytest[j]:
+for j in range(len(Xtrain)):
+    if Ypredit[j] != Ytrain[j]:
         error += w[j]
 print(1-error)
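
For context on what the my_ada_boost.py changes evaluate: the ensemble prediction is the sign of the alpha-weighted vote of the weak learners, now scored against the training labels. A minimal sketch of that step, with toy predicts/alpha/Ytrain values standing in for the script's variables (the real per-tree weighting loop is only partially visible in this diff):

import numpy as np

# three weak learners' +/-1 predictions on five training samples (toy data)
predicts = [np.array([ 1, -1,  1,  1, -1]),
            np.array([ 1,  1,  1, -1, -1]),
            np.array([-1, -1,  1,  1,  1])]
alpha = [0.8, 0.5, 0.3]                 # per-round AdaBoost weights
Ytrain = np.array([1, -1, 1, 1, -1])    # toy training labels

score = sum(a * p for a, p in zip(alpha, predicts))  # sum_m alpha_m * h_m(x)
result = np.sign(score)                              # weighted majority vote
print(result, np.mean(result == Ytrain))             # training accuracy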

tree/my_tree.py (+18 −12)

@@ -64,11 +64,14 @@ def calc_ent(x, weights=None):
     """
     x_value_list = set([x[i] for i in range(x.shape[0])])
     ent = 0.0
+
+    if weights is not None:
+        weights = weights / sum(weights)
+
     for x_value in x_value_list:
         if weights is None:
             p = float(x[x == x_value].shape[0]) / x.shape[0]
         else:
-            weights = weights/sum(weights)
             p = sum(sum([x == x_value]*weights))
         logp = np.log2(p)
         ent -= p * logp
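
The calc_ent change above normalizes the sample weights once before the loop instead of re-normalizing inside the else branch on every iteration. A small standalone sketch of the weighted-entropy idea, assuming the usual definition H = -Σ p·log2(p) with p taken as the total normalized weight of each class value (the function and variable names here are illustrative, not the repository's API):

import numpy as np

def weighted_entropy(x, weights=None):
    x = np.asarray(x)
    if weights is None:
        weights = np.ones(len(x))
    weights = np.asarray(weights, dtype=float)
    weights = weights / weights.sum()        # normalize once, as the diff now does
    ent = 0.0
    for value in set(x.tolist()):
        p = weights[x == value].sum()        # weighted "probability" of this value
        if p > 0:
            ent -= p * np.log2(p)
    return ent

print(weighted_entropy([1, 1, -1, -1]))                # 1.0: uniform weights, 50/50 split
print(weighted_entropy([1, 1, -1, -1], [3, 3, 1, 1]))  # ~0.81: weights skew toward class 1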
@@ -143,7 +146,8 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
     max_ent = 0
     max_mean = 0
     weights = weights / sum(weights)
-    h = calc_ent(Ytrain)
+    h = calc_ent(Ytrain, weights)
+    p = 0
     for k in range(len(Xtrain) - 1):
         left = Xtrain[:k + 1]
         right = Xtrain[k + 1:]
@@ -154,7 +158,7 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
             iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
             iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
         else:
-            p = sum(weights[:k+1])
+            p += weights[k]
             left_ent = calc_ent(Ytrain[:k + 1], weights[:k+1]) * p
             right_ent = calc_ent(Ytrain[k + 1:], weights[k+1:]) * (1-p)
             iv = -p * np.log2(p)
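
The two hunks above replace the per-split slice sum p = sum(weights[:k+1]) with a running total p += weights[k] (initialized to 0 before the loop), so the left-side weight mass is accumulated in O(1) per candidate split instead of O(k). A quick equivalence check on an arbitrary toy weight vector:

import numpy as np

weights = np.array([0.1, 0.3, 0.2, 0.4])
weights = weights / weights.sum()               # as in cal_max_ent_attr_c45

p = 0.0
for k in range(len(weights) - 1):
    p += weights[k]                             # running prefix sum (new code)
    assert np.isclose(p, sum(weights[:k + 1]))  # old slice sum gives the same value
print("running sum matches slice sum")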
@@ -264,23 +268,25 @@ def fit(Xtrain, Ytrain, parent_node, depth, weights):
     if depth >= MAX_T:
         return leaf_node(Ytrain, weights)
 
-    i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain, weights)
-    total_ent = calc_ent(Ytrain)
+    i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain, weights)
+    total_ent = 0 # calc_ent(Ytrain)
     # print("第", i, "个属性,mean:", mean)
     # create the node
     parent_node = TreeNode(i, mean, total_ent - min_ent, False, -2, len(Ytrain), distrib(Ytrain))
 
     # split the data
-    right_Ytrain = Ytrain[Xtrain[:, i] > mean]
-    right_Xtrain = Xtrain[Xtrain[:, i] > mean]
+    right_position = Xtrain[:, i] > mean
+    right_Ytrain = Ytrain[right_position]
+    right_Xtrain = Xtrain[right_position]
     # right_Xtrain = np.delete(right_Xtrain, i, axis=1) # this attribute can still be split again
 
-    right_node = fit(right_Xtrain, right_Ytrain, parent_node, depth+1, weights[Xtrain[:, i] > mean])
+    right_node = fit(right_Xtrain, right_Ytrain, parent_node, depth+1, weights[right_position])
 
-    left_Ytrain = Ytrain[Xtrain[:, i] <= mean]
-    left_Xtrain = Xtrain[Xtrain[:, i] <= mean]
+    left_position = Xtrain[:, i] <= mean
+    left_Ytrain = Ytrain[left_position]
+    left_Xtrain = Xtrain[left_position]
     # left_Xtrain = np.delete(left_Xtrain, i, axis=1)
-    left_node = fit(left_Xtrain, left_Ytrain, parent_node, depth + 1, weights[Xtrain[:, i] <= mean])
+    left_node = fit(left_Xtrain, left_Ytrain, parent_node, depth + 1, weights[left_position])
 
     parent_node.left = left_node
     parent_node.right = right_node
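
The fit() hunk above factors the split condition into boolean masks (right_position, left_position) that are computed once and reused for the labels, features, and sample weights. A minimal illustration of that NumPy boolean-mask pattern, with made-up arrays and a made-up split (column 0, threshold 0.5):

import numpy as np

Xtrain = np.array([[0.2, 1.0], [0.7, 0.3], [0.9, 0.8], [0.1, 0.5]])
Ytrain = np.array([-1, 1, 1, -1])
weights = np.full(len(Ytrain), 0.25)

right_position = Xtrain[:, 0] > 0.5           # evaluate the split condition once
left_position = Xtrain[:, 0] <= 0.5           # complementary mask for the left branch

right_Xtrain, right_Ytrain = Xtrain[right_position], Ytrain[right_position]
left_Xtrain, left_Ytrain = Xtrain[left_position], Ytrain[left_position]
right_weights, left_weights = weights[right_position], weights[left_position]

print(right_Ytrain, left_Ytrain)              # [1 1] [-1 -1]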
@@ -334,7 +340,7 @@ if __name__ == '__main__':
 
     print("基尼指数", cal_gini(Ytrain))
 
-    print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain))
+    print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain, weights))
 
     node = fit(Xtrain, Ytrain, None, 0, weights)
     print_width([node], 1)