|
@@ -124,9 +124,8 @@ def calc_ent1(x):
|
124
|
124
|
|
125
|
125
|
|
126
|
126
|
# 计算某个属性的信息增益
|
127
|
|
-def cal_ent_attr(Xtrain, Ytrain, weights):
|
|
127
|
+def cal_ent_attr(Xtrain, Ytrain):
|
128
|
128
|
# print('sharp', Xtrain.shape)
|
129
|
|
- weights = weights / sum(weights)
|
130
|
129
|
# 对每个属性
|
131
|
130
|
min_ent = 100
|
132
|
131
|
min_i = 0
|
|
@@ -138,10 +137,10 @@ def cal_ent_attr(Xtrain, Ytrain, weights):
|
138
|
137
|
sum_ent = 0
|
139
|
138
|
# 二叉树
|
140
|
139
|
p = Ytrain[Xtrain[:, i] > mean]
|
141
|
|
- p0 = sum(weights[Xtrain[:, i] > mean])
|
142
|
|
- sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] > mean])*p0
|
|
140
|
+ p0 = len(p)/Ytrain.shape[0]
|
|
141
|
+ sum_ent = sum_ent + calc_ent(p)*p0
|
143
|
142
|
p = Ytrain[Xtrain[:, i] <= mean]
|
144
|
|
- sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] <= mean])*(1-p0)
|
|
143
|
+ sum_ent = sum_ent + calc_ent(p)*(1-p0)
|
145
|
144
|
|
146
|
145
|
if sum_ent <= min_ent:
|
147
|
146
|
min_ent = sum_ent
|
|
@@ -226,8 +225,6 @@ def cal_gini_attr(Xtrain, Ytrain):
|
226
|
225
|
min_mean = mean
|
227
|
226
|
return min_i, min_mean, min_ent
|
228
|
227
|
|
229
|
|
-MAX_T = 1
|
230
|
|
-
|
231
|
228
|
|
232
|
229
|
def is_end(Ytrain):
|
233
|
230
|
if len(Ytrain) == 0:
|
|
@@ -236,16 +233,16 @@ def is_end(Ytrain):
|
236
|
233
|
return True
|
237
|
234
|
|
238
|
235
|
# 强行划分为叶子节点
|
239
|
|
-def leaf_node(Ytrain, weights):
|
|
236
|
+def leaf_node(Ytrain):
|
240
|
237
|
p_set = []
|
241
|
238
|
k = 0
|
242
|
239
|
for item in Ytrain:
|
243
|
240
|
for i in p_set:
|
244
|
241
|
if i[0] == item:
|
245
|
|
- i[1] = i[1] + weights[k]
|
|
242
|
+ i[1] = i[1] + 1
|
246
|
243
|
break
|
247
|
244
|
else:
|
248
|
|
- i = [item, weights[k]]
|
|
245
|
+ i = [item, 1]
|
249
|
246
|
p_set.append(i)
|
250
|
247
|
k = k + 1
|
251
|
248
|
|
|
@@ -309,25 +306,28 @@ class MyDT(object):
|
309
|
306
|
self.max_depth = max_depth
|
310
|
307
|
|
311
|
308
|
def fit(self, Xtrain, Ytrain, sample_weight=None):
    """Fit the tree on the training data and store it in ``self.root_node``.

    When ``sample_weight`` is given, the weights are honoured by drawing a
    bootstrap-style resample of the training rows (with replacement, same
    size as ``Ytrain``) in which each row is picked with probability
    proportional to its weight. The tree is then built on that resample
    by ``self.do_fit``, starting at depth 0.

    Args:
        Xtrain: Feature array, indexed by row along its first axis.
        Ytrain: Label array with one entry per row of ``Xtrain``.
        sample_weight: Optional non-negative per-row weights. They do not
            need to sum to 1; they are normalised here before sampling.

    Raises:
        ValueError: If the weights do not have a positive, finite sum
            (or if ``np.random.choice`` rejects them, e.g. negative
            entries or a length that differs from the number of rows).
    """
    if sample_weight is not None:
        probabilities = np.asarray(sample_weight, dtype=float)
        total = probabilities.sum()
        if not np.isfinite(total) or total <= 0:
            raise ValueError("sample_weight must have a positive, finite sum")
        # np.random.choice requires p to sum to 1, so callers passing raw
        # (unnormalised) weights would otherwise get a ValueError.
        probabilities = probabilities / total
        indices = np.random.choice(
            Xtrain.shape[0], Ytrain.shape[0], p=probabilities
        )
        Xtrain = Xtrain[indices]
        Ytrain = Ytrain[indices]

    self.root_node = self.do_fit(Xtrain, Ytrain, 0)
|
315
|
315
|
|
316
|
|
- def do_fit(self, Xtrain, Ytrain, depth, weights):
|
|
316
|
+ def do_fit(self, Xtrain, Ytrain, depth):
|
317
|
317
|
|
318
|
318
|
if is_end(Ytrain):
|
319
|
319
|
# print('这个是叶子节点')
|
320
|
|
- return leaf_node(Ytrain, weights)
|
|
320
|
+ return leaf_node(Ytrain)
|
321
|
321
|
|
322
|
322
|
if depth >= self.max_depth:
|
323
|
|
- return leaf_node(Ytrain, weights)
|
|
323
|
+ return leaf_node(Ytrain)
|
324
|
324
|
|
325
|
325
|
if self.criterion == 'entropy':
|
326
|
|
- i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain, weights)
|
|
326
|
+ i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain)
|
327
|
327
|
elif self.criterion == 'C4.5':
|
328
|
|
- i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain, weights)
|
|
328
|
+ i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain)
|
329
|
329
|
else:
|
330
|
|
- i, mean, min_ent = cal_gini_attr(Xtrain, Ytrain, weights)
|
|
330
|
+ i, mean, min_ent = cal_gini_attr(Xtrain, Ytrain)
|
331
|
331
|
total_ent = 0 # calc_ent(Ytrain)
|
332
|
332
|
# print("第", i, "个属性,mean:", mean)
|
333
|
333
|
# 生成节点
|
|
@@ -339,13 +339,13 @@ class MyDT(object):
|
339
|
339
|
right_Xtrain = Xtrain[right_position]
|
340
|
340
|
# right_Xtrain = np.delete(right_Xtrain, i, axis=1) # 这个属性还可以再被切分
|
341
|
341
|
|
342
|
|
- right_node = self.do_fit(right_Xtrain, right_Ytrain, depth + 1, weights[right_position])
|
|
342
|
+ right_node = self.do_fit(right_Xtrain, right_Ytrain, depth + 1)
|
343
|
343
|
|
344
|
344
|
left_position = Xtrain[:, i] <= mean
|
345
|
345
|
left_Ytrain = Ytrain[left_position]
|
346
|
346
|
left_Xtrain = Xtrain[left_position]
|
347
|
347
|
# left_Xtrain = np.delete(left_Xtrain, i, axis=1)
|
348
|
|
- left_node = self.do_fit(left_Xtrain, left_Ytrain, depth + 1, weights[left_position])
|
|
348
|
+ left_node = self.do_fit(left_Xtrain, left_Ytrain, depth + 1)
|
349
|
349
|
|
350
|
350
|
parent_node.left = left_node
|
351
|
351
|
parent_node.right = right_node
|