Browse Source

使用重采样法

yufeng0528 4 years ago
parent
commit
4311d0b82a
2 changed files with 14 additions and 27 deletions
  1. integr/my_ada_boost.py (+2 -2)
  2. tree/my_tree.py (+12 -25)

+ 2 - 2
integr/my_ada_boost.py

@@ -28,9 +28,9 @@ def fit(Xtrain, Ytrain):
28
     pn = 1/len(Ytrain)
28
     pn = 1/len(Ytrain)
29
     # print(Ytest)
29
     # print(Ytest)
30
 
30
 
31
-    for i in range(20):
31
+    for i in range(30):
32
         # 训练决策树
32
         # 训练决策树
33
-        clf = MyDT(criterion="entropy", max_features=1, max_depth=1,
33
+        clf = MyDT(criterion="gini", max_features=1, max_depth=1,
34
                                      )  # 实例化,criterion不写的话默认是基尼系数
34
                                      )  # 实例化,criterion不写的话默认是基尼系数
35
 
35
 
36
         clf.fit(Xtrain, Ytrain, w)
36
         clf.fit(Xtrain, Ytrain, w)

+ 12 - 25
tree/my_tree.py

@@ -149,28 +149,19 @@ def cal_ent_attr(Xtrain, Ytrain):
149
     return min_i, min_mean, min_ent
149
     return min_i, min_mean, min_ent
150
 
150
 
151
 
151
 
152
-def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
152
+def cal_max_ent_attr_c45(Xtrain, Ytrain):
153
     max_ent = 0
153
     max_ent = 0
154
     max_mean = 0
154
     max_mean = 0
155
-    weights = weights / sum(weights)
156
-    h = calc_ent(Ytrain, weights)
155
+    h = calc_ent(Ytrain)
157
     p = 0
156
     p = 0
158
     for k in range(0, len(Xtrain) - 1, 3):
157
     for k in range(0, len(Xtrain) - 1, 3):
159
         left = Xtrain[:k + 1]
158
         left = Xtrain[:k + 1]
160
         right = Xtrain[k + 1:]
159
         right = Xtrain[k + 1:]
161
 
160
 
162
-        if weights is None:
163
-            left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
164
-            right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
165
-            iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
166
-            iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
167
-        else:
168
-            p += weights[k]
169
-            left_ent = calc_ent(Ytrain[:k + 1], weights[:k+1]) * p
170
-            right_ent = calc_ent(Ytrain[k + 1:], weights[k+1:]) * (1-p)
171
-            iv = -p * np.log2(p)
172
-            iv -= (1-p) * np.log2(1-p)
173
-
161
+        left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
162
+        right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
163
+        iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
164
+        iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
174
 
165
 
175
         gain_ent = (h - left_ent - right_ent)/iv
166
         gain_ent = (h - left_ent - right_ent)/iv
176
 
167
 
@@ -179,21 +170,17 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
179
             max_mean = left[-1]
170
             max_mean = left[-1]
180
     return  max_ent, max_mean
171
     return  max_ent, max_mean
181
 
172
 
182
-# 样本权重
183
-weights = []
184
-
185
 # 计算某个属性的信息增益率
173
 # 计算某个属性的信息增益率
186
-def cal_ent_attr_c45(Xtrain, Ytrain, weights):
174
+def cal_ent_attr_c45(Xtrain, Ytrain):
187
     # 对每个属性
175
     # 对每个属性
188
     max_ent = 0
176
     max_ent = 0
189
     max_i = 0
177
     max_i = 0
190
     max_mean = 0
178
     max_mean = 0
191
-    weights = weights / sum(weights)
192
     for i in range(Xtrain.shape[1]): #每个属性
179
     for i in range(Xtrain.shape[1]): #每个属性
193
         argsort = Xtrain[:,i].argsort()
180
         argsort = Xtrain[:,i].argsort()
194
-        x,y,w = Xtrain[:,i][argsort], Ytrain[argsort], weights[argsort]
181
+        x,y = Xtrain[:,i][argsort], Ytrain[argsort]
195
 
182
 
196
-        gain_ent, mean = cal_max_ent_attr_c45(x, y, w)
183
+        gain_ent, mean = cal_max_ent_attr_c45(x, y)
197
 
184
 
198
         if gain_ent > max_ent:
185
         if gain_ent > max_ent:
199
             max_ent = gain_ent
186
             max_ent = gain_ent
@@ -385,13 +372,13 @@ if __name__ == '__main__':
385
 
372
 
386
     weights = np.ones(len(Ytrain))/Ytrain.shape[0]
373
     weights = np.ones(len(Ytrain))/Ytrain.shape[0]
387
     print("熵值", calc_ent(Ytrain))
374
     print("熵值", calc_ent(Ytrain))
388
-    print("熵值", calc_ent(Ytrain, weights))
375
+    print("熵值", calc_ent(Ytrain))
389
 
376
 
390
     print("基尼指数", cal_gini(Ytrain))
377
     print("基尼指数", cal_gini(Ytrain))
391
 
378
 
392
-    print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain, weights))
379
+    print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain))
393
 
380
 
394
-    clf = MyDT(criterion="entropy", max_depth=1,)
381
+    clf = MyDT(criterion="C4.5", max_depth=1,)
395
     clf.fit(Xtrain, Ytrain, weights)
382
     clf.fit(Xtrain, Ytrain, weights)
396
 
383
 
397
     # print_width([node], 1)
384
     # print_width([node], 1)