|
@@ -64,11 +64,14 @@ def calc_ent(x, weights=None):
|
64
|
64
|
"""
|
65
|
65
|
x_value_list = set([x[i] for i in range(x.shape[0])])
|
66
|
66
|
ent = 0.0
|
|
67
|
+
|
|
68
|
+ if weights is not None:
|
|
69
|
+ weights = weights / sum(weights)
|
|
70
|
+
|
67
|
71
|
for x_value in x_value_list:
|
68
|
72
|
if weights is None:
|
69
|
73
|
p = float(x[x == x_value].shape[0]) / x.shape[0]
|
70
|
74
|
else:
|
71
|
|
- weights = weights/sum(weights)
|
72
|
75
|
p = sum(sum([x == x_value]*weights))
|
73
|
76
|
logp = np.log2(p)
|
74
|
77
|
ent -= p * logp
|
|
@@ -143,7 +146,8 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
|
143
|
146
|
max_ent = 0
|
144
|
147
|
max_mean = 0
|
145
|
148
|
weights = weights / sum(weights)
|
146
|
|
- h = calc_ent(Ytrain)
|
|
149
|
+ h = calc_ent(Ytrain, weights)
|
|
150
|
+ p = 0
|
147
|
151
|
for k in range(len(Xtrain) - 1):
|
148
|
152
|
left = Xtrain[:k + 1]
|
149
|
153
|
right = Xtrain[k + 1:]
|
|
@@ -154,7 +158,7 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
|
154
|
158
|
iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
|
155
|
159
|
iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
|
156
|
160
|
else:
|
157
|
|
- p = sum(weights[:k+1])
|
|
161
|
+ p += weights[k]
|
158
|
162
|
left_ent = calc_ent(Ytrain[:k + 1], weights[:k+1]) * p
|
159
|
163
|
right_ent = calc_ent(Ytrain[k + 1:], weights[k+1:]) * (1-p)
|
160
|
164
|
iv = -p * np.log2(p)
|
|
@@ -264,23 +268,25 @@ def fit(Xtrain, Ytrain, parent_node, depth, weights):
|
264
|
268
|
if depth >= MAX_T:
|
265
|
269
|
return leaf_node(Ytrain, weights)
|
266
|
270
|
|
267
|
|
- i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain, weights)
|
268
|
|
- total_ent = calc_ent(Ytrain)
|
|
271
|
+ i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain, weights)
|
|
272
|
+ total_ent = 0 # calc_ent(Ytrain)
|
269
|
273
|
# print("第", i, "个属性,mean:", mean)
|
270
|
274
|
# 生成节点
|
271
|
275
|
parent_node = TreeNode(i, mean, total_ent - min_ent, False, -2, len(Ytrain), distrib(Ytrain))
|
272
|
276
|
|
273
|
277
|
# 切分数据
|
274
|
|
- right_Ytrain = Ytrain[Xtrain[:, i] > mean]
|
275
|
|
- right_Xtrain = Xtrain[Xtrain[:, i] > mean]
|
|
278
|
+ right_position = Xtrain[:, i] > mean
|
|
279
|
+ right_Ytrain = Ytrain[right_position]
|
|
280
|
+ right_Xtrain = Xtrain[right_position]
|
276
|
281
|
# right_Xtrain = np.delete(right_Xtrain, i, axis=1) # 这个属性还可以再被切分
|
277
|
282
|
|
278
|
|
- right_node = fit(right_Xtrain, right_Ytrain, parent_node, depth+1, weights[Xtrain[:, i] > mean])
|
|
283
|
+ right_node = fit(right_Xtrain, right_Ytrain, parent_node, depth+1, weights[right_position])
|
279
|
284
|
|
280
|
|
- left_Ytrain = Ytrain[Xtrain[:, i] <= mean]
|
281
|
|
- left_Xtrain = Xtrain[Xtrain[:, i] <= mean]
|
|
285
|
+ left_position = Xtrain[:, i] <= mean
|
|
286
|
+ left_Ytrain = Ytrain[left_position]
|
|
287
|
+ left_Xtrain = Xtrain[left_position]
|
282
|
288
|
# left_Xtrain = np.delete(left_Xtrain, i, axis=1)
|
283
|
|
- left_node = fit(left_Xtrain, left_Ytrain, parent_node, depth + 1, weights[Xtrain[:, i] <= mean])
|
|
289
|
+ left_node = fit(left_Xtrain, left_Ytrain, parent_node, depth + 1, weights[left_position])
|
284
|
290
|
|
285
|
291
|
parent_node.left = left_node
|
286
|
292
|
parent_node.right = right_node
|
|
@@ -334,7 +340,7 @@ if __name__ == '__main__':
|
334
|
340
|
|
335
|
341
|
print("基尼指数", cal_gini(Ytrain))
|
336
|
342
|
|
337
|
|
- print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain))
|
|
343
|
+ print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain, weights))
|
338
|
344
|
|
339
|
345
|
node = fit(Xtrain, Ytrain, None, 0, weights)
|
340
|
346
|
print_width([node], 1)
|