Browse Source

使用C4.5算法

yufeng0528 4 years ago
parent
commit
612f733262
2 changed files with 36 additions and 20 deletions
  1. 18 6
      integr/my_ada_boost.py
  2. 18 14
      tree/my_tree.py

+ 18 - 6
integr/my_ada_boost.py

@@ -24,13 +24,13 @@ for i in range(len(Ytest)):
24 24
         Ytest[i] = -1
25 25
 print(Ytest)
26 26
 
27
-for i in range(30):
27
+for i in range(3):
28 28
     # 训练决策树
29 29
     # clf = DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=1,
30 30
     #                              class_weight={0:w0, 1:1-w0})  # 实例化,criterion不写的话默认是基尼系数
31 31
     nodes = my_tree.fit(Xtrain, Ytrain, None, 0, w)
32 32
 
33
-    # my_tree.print_width([nodes], 1)
33
+    my_tree.print_width([nodes], 1)
34 34
     # print("熵值", my_tree.calc_ent(Ytrain, w))
35 35
     Xpredit = my_tree.predict(Xtrain, Ytrain, nodes)
36 36
     error = 0
@@ -76,11 +76,23 @@ print("sign后:", result)
76 76
 
77 77
 # print(1- sum(np.bitwise_xor(Ytest, result))/len(result))
78 78
 # print(result == Ytest)
79
-print(len([i for i in result == Ytest if i])/len(result))
79
+print()
80
+x = len([i for i in result == Ytest if i])/len(result)
81
+x = x + 0.5*len([i for i in result if i==0])/len(result)
82
+print(x)
80 83
 # cmp = np.concatenate(([result], [Ytest]), axis=0)
81 84
 # print(cmp)
82 85
 
83 86
 
84
-clf = DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=1)
85
-clf = clf.fit(Xtrain, Ytrain)
86
-print(clf.score(Xtest, Ytest))
87
+# clf = DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=2)
88
+# clf = clf.fit(Xtrain, Ytrain)
89
+# print(clf.score(Xtest, Ytest))
90
+
91
+w = np.array([1/len(Ytrain) for i in range(len(Ytrain))])
92
+my_tree_0 = trees[0]
93
+Xpredit = my_tree.predict(Xtest, None, nodes)
94
+error = 0
95
+for j in range(len(Xtest)):
96
+    if Xpredit[j] != Ytest[j]:
97
+        error += w[j]
98
+print(1-error)

+ 18 - 14
tree/my_tree.py

@@ -32,8 +32,8 @@ class TreeNode(object):
32 32
         self.left = left
33 33
         self.right = right
34 34
 
35
-        # if self.y == -1:
36
-        #     self.y = np.where(value == np.max(value))[0][0]
35
+        if self.y is None:
36
+            self.y = np.where(value == np.max(value))[0][0] ## TODO
37 37
             # print(self.y, self.value)
38 38
 
39 39
     def __str__(self):
@@ -121,7 +121,7 @@ def cal_ent_attr(Xtrain, Ytrain, weights):
121 121
     min_i = 0
122 122
     min_mean = 0
123 123
 
124
-    for i in np.random.randint(0,Xtrain.shape[1],size=(15)):
124
+    for i in range(Xtrain.shape[1]):
125 125
         x_value_list = set([Xtrain[j][i] for j in range(Xtrain.shape[0])])
126 126
         mean = sum(x_value_list)/len(x_value_list)
127 127
         sum_ent = 0
@@ -142,6 +142,7 @@ def cal_ent_attr(Xtrain, Ytrain, weights):
142 142
 def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
143 143
     max_ent = 0
144 144
     max_mean = 0
145
+    weights = weights / sum(weights)
145 146
     h = calc_ent(Ytrain)
146 147
     for k in range(len(Xtrain) - 1):
147 148
         left = Xtrain[:k + 1]
@@ -150,12 +151,14 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
150 151
         if weights is None:
151 152
             left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
152 153
             right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
153
-
154
+            iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
155
+            iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
154 156
         else:
155
-            pass
156
-
157
-        iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
158
-        iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
157
+            p = sum(weights[:k+1])
158
+            left_ent = calc_ent(Ytrain[:k + 1], weights[:k+1]) * p
159
+            right_ent = calc_ent(Ytrain[k + 1:], weights[k+1:]) * (1-p)
160
+            iv = -p * np.log2(p)
161
+            iv -= (1-p) * np.log2(1-p)
159 162
 
160 163
 
161 164
         gain_ent = (h - left_ent - right_ent)/iv
@@ -169,16 +172,17 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
169 172
 weights = []
170 173
 
171 174
 # 计算某个属性的信息增益率
172
-def cal_ent_attr_c45(Xtrain, Ytrain):
175
+def cal_ent_attr_c45(Xtrain, Ytrain, weights):
173 176
     # 对每个属性
174 177
     max_ent = 0
175 178
     max_i = 0
176 179
     max_mean = 0
180
+    weights = weights / sum(weights)
177 181
     for i in range(Xtrain.shape[1]): #每个属性
178 182
         argsort = Xtrain[:,i].argsort()
179
-        x,y = Xtrain[:,i][argsort], Ytrain[argsort]
183
+        x,y,w = Xtrain[:,i][argsort], Ytrain[argsort], weights[argsort]
180 184
 
181
-        gain_ent, mean = cal_max_ent_attr_c45(x, y)
185
+        gain_ent, mean = cal_max_ent_attr_c45(x, y, w)
182 186
 
183 187
         if gain_ent > max_ent:
184 188
             max_ent = gain_ent
@@ -255,16 +259,16 @@ def fit(Xtrain, Ytrain, parent_node, depth, weights):
255 259
 
256 260
     if is_end(Ytrain):
257 261
         # print('这个是叶子节点')
258
-        return TreeNode(-1, 0, 0, True, -1, len(Ytrain), distrib(Ytrain))
262
+        return leaf_node(Ytrain, weights)
259 263
 
260 264
     if depth >= MAX_T:
261 265
         return leaf_node(Ytrain, weights)
262 266
 
263
-    i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain, weights)
267
+    i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain, weights)
264 268
     total_ent = calc_ent(Ytrain)
265 269
     # print("第", i, "个属性,mean:", mean)
266 270
     # 生成节点
267
-    parent_node = TreeNode(i, mean, total_ent - min_ent, False, -1, len(Ytrain), distrib(Ytrain))
271
+    parent_node = TreeNode(i, mean, total_ent - min_ent, False, -2, len(Ytrain), distrib(Ytrain))
268 272
 
269 273
     # 切分数据
270 274
     right_Ytrain = Ytrain[Xtrain[:, i] > mean]