Browse Source

采用 重采样法

yufeng0528 4 years ago
parent
commit
37f09ba466
1 changed file with 21 additions and 21 deletions
  1. 21 21
      tree/my_tree.py

+ 21 - 21
tree/my_tree.py

@@ -124,9 +124,8 @@ def calc_ent1(x):
124 124
 
125 125
 
126 126
 # 计算某个属性的信息增益
127
-def cal_ent_attr(Xtrain, Ytrain, weights):
127
+def cal_ent_attr(Xtrain, Ytrain):
128 128
     # print('sharp', Xtrain.shape)
129
-    weights = weights / sum(weights)
130 129
     # 对每个属性
131 130
     min_ent = 100
132 131
     min_i = 0
@@ -138,10 +137,10 @@ def cal_ent_attr(Xtrain, Ytrain, weights):
138 137
         sum_ent = 0
139 138
         # 二叉树
140 139
         p = Ytrain[Xtrain[:, i] > mean]
141
-        p0 = sum(weights[Xtrain[:, i] > mean])
142
-        sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] > mean])*p0
140
+        p0 = len(p)/Ytrain.shape[0]
141
+        sum_ent = sum_ent + calc_ent(p)*p0
143 142
         p = Ytrain[Xtrain[:, i] <= mean]
144
-        sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] <= mean])*(1-p0)
143
+        sum_ent = sum_ent + calc_ent(p)*(1-p0)
145 144
 
146 145
         if sum_ent <= min_ent:
147 146
             min_ent = sum_ent
@@ -226,8 +225,6 @@ def cal_gini_attr(Xtrain, Ytrain):
226 225
             min_mean = mean
227 226
     return min_i, min_mean, min_ent
228 227
 
229
-MAX_T = 1
230
-
231 228
 
232 229
 def is_end(Ytrain):
233 230
     if len(Ytrain) == 0:
@@ -236,16 +233,16 @@ def is_end(Ytrain):
236 233
         return True
237 234
 
238 235
 # 强行划分为叶子节点
239
-def leaf_node(Ytrain, weights):
236
+def leaf_node(Ytrain):
240 237
     p_set = []
241 238
     k = 0
242 239
     for item in Ytrain:
243 240
         for i in p_set:
244 241
             if i[0] == item:
245
-                i[1] = i[1] + weights[k]
242
+                i[1] = i[1] + 1
246 243
                 break
247 244
         else:
248
-            i = [item, weights[k]]
245
+            i = [item, 1]
249 246
             p_set.append(i)
250 247
         k = k + 1
251 248
 
@@ -309,25 +306,28 @@ class MyDT(object):
309 306
         self.max_depth = max_depth
310 307
 
311 308
     def fit(self, Xtrain, Ytrain, sample_weight=None):
312
-        if sample_weight is None:
313
-            sample_weight = np.ones(Ytrain.shape[0]) / Ytrain.shape[0]
314
-        self.root_node = self.do_fit(Xtrain, Ytrain, 0, sample_weight)
309
+        if sample_weight is not None:
310
+            indices = [i for i in np.random.choice(Xtrain.shape[0], Ytrain.shape[0], p=sample_weight)]
311
+            Xtrain = Xtrain[indices]
312
+            Ytrain = Ytrain[indices]
313
+
314
+        self.root_node = self.do_fit(Xtrain, Ytrain, 0)
315 315
 
316
-    def do_fit(self, Xtrain, Ytrain, depth, weights):
316
+    def do_fit(self, Xtrain, Ytrain, depth):
317 317
 
318 318
         if is_end(Ytrain):
319 319
             # print('这个是叶子节点')
320
-            return leaf_node(Ytrain, weights)
320
+            return leaf_node(Ytrain)
321 321
 
322 322
         if depth >= self.max_depth:
323
-            return leaf_node(Ytrain, weights)
323
+            return leaf_node(Ytrain)
324 324
 
325 325
         if self.criterion == 'entropy':
326
-            i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain, weights)
326
+            i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain)
327 327
         elif self.criterion == 'C4.5':
328
-            i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain, weights)
328
+            i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain)
329 329
         else:
330
-            i, mean, min_ent = cal_gini_attr(Xtrain, Ytrain, weights)
330
+            i, mean, min_ent = cal_gini_attr(Xtrain, Ytrain)
331 331
         total_ent = 0  # calc_ent(Ytrain)
332 332
         # print("第", i, "个属性,mean:", mean)
333 333
         # 生成节点
@@ -339,13 +339,13 @@ class MyDT(object):
339 339
         right_Xtrain = Xtrain[right_position]
340 340
         # right_Xtrain = np.delete(right_Xtrain, i, axis=1) # 这个属性还可以再被切分
341 341
 
342
-        right_node = self.do_fit(right_Xtrain, right_Ytrain, depth + 1, weights[right_position])
342
+        right_node = self.do_fit(right_Xtrain, right_Ytrain, depth + 1)
343 343
 
344 344
         left_position = Xtrain[:, i] <= mean
345 345
         left_Ytrain = Ytrain[left_position]
346 346
         left_Xtrain = Xtrain[left_position]
347 347
         # left_Xtrain = np.delete(left_Xtrain, i, axis=1)
348
-        left_node = self.do_fit(left_Xtrain, left_Ytrain, depth + 1, weights[left_position])
348
+        left_node = self.do_fit(left_Xtrain, left_Ytrain, depth + 1)
349 349
 
350 350
         parent_node.left = left_node
351 351
         parent_node.right = right_node