Browse Source

使用重采样法

yufeng0528 4 years ago
parent
commit
4311d0b82a
2 changed files with 14 additions and 27 deletions
  1. 2 2
      integr/my_ada_boost.py
  2. 12 25
      tree/my_tree.py

+ 2 - 2
integr/my_ada_boost.py

@@ -28,9 +28,9 @@ def fit(Xtrain, Ytrain):
28 28
     pn = 1/len(Ytrain)
29 29
     # print(Ytest)
30 30
 
31
-    for i in range(20):
31
+    for i in range(30):
32 32
         # 训练决策树
33
-        clf = MyDT(criterion="entropy", max_features=1, max_depth=1,
33
+        clf = MyDT(criterion="gini", max_features=1, max_depth=1,
34 34
                                      )  # 实例化,criterion不写的话默认是基尼系数
35 35
 
36 36
         clf.fit(Xtrain, Ytrain, w)

+ 12 - 25
tree/my_tree.py

@@ -149,28 +149,19 @@ def cal_ent_attr(Xtrain, Ytrain):
149 149
     return min_i, min_mean, min_ent
150 150
 
151 151
 
152
-def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
152
+def cal_max_ent_attr_c45(Xtrain, Ytrain):
153 153
     max_ent = 0
154 154
     max_mean = 0
155
-    weights = weights / sum(weights)
156
-    h = calc_ent(Ytrain, weights)
155
+    h = calc_ent(Ytrain)
157 156
     p = 0
158 157
     for k in range(0, len(Xtrain) - 1, 3):
159 158
         left = Xtrain[:k + 1]
160 159
         right = Xtrain[k + 1:]
161 160
 
162
-        if weights is None:
163
-            left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
164
-            right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
165
-            iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
166
-            iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
167
-        else:
168
-            p += weights[k]
169
-            left_ent = calc_ent(Ytrain[:k + 1], weights[:k+1]) * p
170
-            right_ent = calc_ent(Ytrain[k + 1:], weights[k+1:]) * (1-p)
171
-            iv = -p * np.log2(p)
172
-            iv -= (1-p) * np.log2(1-p)
173
-
161
+        left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
162
+        right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
163
+        iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
164
+        iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
174 165
 
175 166
         gain_ent = (h - left_ent - right_ent)/iv
176 167
 
@@ -179,21 +170,17 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
179 170
             max_mean = left[-1]
180 171
     return  max_ent, max_mean
181 172
 
182
-# 样本权重
183
-weights = []
184
-
185 173
 # 计算某个属性的信息增益率
186
-def cal_ent_attr_c45(Xtrain, Ytrain, weights):
174
+def cal_ent_attr_c45(Xtrain, Ytrain):
187 175
     # 对每个属性
188 176
     max_ent = 0
189 177
     max_i = 0
190 178
     max_mean = 0
191
-    weights = weights / sum(weights)
192 179
     for i in range(Xtrain.shape[1]): #每个属性
193 180
         argsort = Xtrain[:,i].argsort()
194
-        x,y,w = Xtrain[:,i][argsort], Ytrain[argsort], weights[argsort]
181
+        x,y = Xtrain[:,i][argsort], Ytrain[argsort]
195 182
 
196
-        gain_ent, mean = cal_max_ent_attr_c45(x, y, w)
183
+        gain_ent, mean = cal_max_ent_attr_c45(x, y)
197 184
 
198 185
         if gain_ent > max_ent:
199 186
             max_ent = gain_ent
@@ -385,13 +372,13 @@ if __name__ == '__main__':
385 372
 
386 373
     weights = np.ones(len(Ytrain))/Ytrain.shape[0]
387 374
     print("熵值", calc_ent(Ytrain))
388
-    print("熵值", calc_ent(Ytrain, weights))
375
+    print("熵值", calc_ent(Ytrain))
389 376
 
390 377
     print("基尼指数", cal_gini(Ytrain))
391 378
 
392
-    print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain, weights))
379
+    print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain))
393 380
 
394
-    clf = MyDT(criterion="entropy", max_depth=1,)
381
+    clf = MyDT(criterion="C4.5", max_depth=1,)
395 382
     clf.fit(Xtrain, Ytrain, weights)
396 383
 
397 384
     # print_width([node], 1)