|
@@ -32,8 +32,8 @@ class TreeNode(object):
|
32
|
32
|
self.left = left
|
33
|
33
|
self.right = right
|
34
|
34
|
|
35
|
|
- # if self.y == -1:
|
36
|
|
- # self.y = np.where(value == np.max(value))[0][0]
|
|
35
|
+ if self.y is None:
|
|
36
|
+ self.y = np.where(value == np.max(value))[0][0] ## TODO
|
37
|
37
|
# print(self.y, self.value)
|
38
|
38
|
|
39
|
39
|
def __str__(self):
|
|
@@ -121,7 +121,7 @@ def cal_ent_attr(Xtrain, Ytrain, weights):
|
121
|
121
|
min_i = 0
|
122
|
122
|
min_mean = 0
|
123
|
123
|
|
124
|
|
- for i in np.random.randint(0,Xtrain.shape[1],size=(15)):
|
|
124
|
+ for i in range(Xtrain.shape[1]):
|
125
|
125
|
x_value_list = set([Xtrain[j][i] for j in range(Xtrain.shape[0])])
|
126
|
126
|
mean = sum(x_value_list)/len(x_value_list)
|
127
|
127
|
sum_ent = 0
|
|
@@ -142,6 +142,7 @@ def cal_ent_attr(Xtrain, Ytrain, weights):
|
142
|
142
|
def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
|
143
|
143
|
max_ent = 0
|
144
|
144
|
max_mean = 0
|
|
145
|
+ weights = weights / sum(weights)
|
145
|
146
|
h = calc_ent(Ytrain)
|
146
|
147
|
for k in range(len(Xtrain) - 1):
|
147
|
148
|
left = Xtrain[:k + 1]
|
|
@@ -150,12 +151,14 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
|
150
|
151
|
if weights is None:
|
151
|
152
|
left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
|
152
|
153
|
right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
|
153
|
|
-
|
|
154
|
+ iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
|
|
155
|
+ iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
|
154
|
156
|
else:
|
155
|
|
- pass
|
156
|
|
-
|
157
|
|
- iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
|
158
|
|
- iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
|
|
157
|
+ p = sum(weights[:k+1])
|
|
158
|
+ left_ent = calc_ent(Ytrain[:k + 1], weights[:k+1]) * p
|
|
159
|
+ right_ent = calc_ent(Ytrain[k + 1:], weights[k+1:]) * (1-p)
|
|
160
|
+ iv = -p * np.log2(p)
|
|
161
|
+ iv -= (1-p) * np.log2(1-p)
|
159
|
162
|
|
160
|
163
|
|
161
|
164
|
gain_ent = (h - left_ent - right_ent)/iv
|
|
@@ -169,16 +172,17 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
|
169
|
172
|
weights = []
|
170
|
173
|
|
171
|
174
|
# 计算某个属性的信息增益率
|
172
|
|
-def cal_ent_attr_c45(Xtrain, Ytrain):
|
|
175
|
+def cal_ent_attr_c45(Xtrain, Ytrain, weights):
|
173
|
176
|
# 对每个属性
|
174
|
177
|
max_ent = 0
|
175
|
178
|
max_i = 0
|
176
|
179
|
max_mean = 0
|
|
180
|
+ weights = weights / sum(weights)
|
177
|
181
|
for i in range(Xtrain.shape[1]): #每个属性
|
178
|
182
|
argsort = Xtrain[:,i].argsort()
|
179
|
|
- x,y = Xtrain[:,i][argsort], Ytrain[argsort]
|
|
183
|
+ x,y,w = Xtrain[:,i][argsort], Ytrain[argsort], weights[argsort]
|
180
|
184
|
|
181
|
|
- gain_ent, mean = cal_max_ent_attr_c45(x, y)
|
|
185
|
+ gain_ent, mean = cal_max_ent_attr_c45(x, y, w)
|
182
|
186
|
|
183
|
187
|
if gain_ent > max_ent:
|
184
|
188
|
max_ent = gain_ent
|
|
@@ -255,16 +259,16 @@ def fit(Xtrain, Ytrain, parent_node, depth, weights):
|
255
|
259
|
|
256
|
260
|
if is_end(Ytrain):
|
257
|
261
|
# print('这个是叶子节点')
|
258
|
|
- return TreeNode(-1, 0, 0, True, -1, len(Ytrain), distrib(Ytrain))
|
|
262
|
+ return leaf_node(Ytrain, weights)
|
259
|
263
|
|
260
|
264
|
if depth >= MAX_T:
|
261
|
265
|
return leaf_node(Ytrain, weights)
|
262
|
266
|
|
263
|
|
- i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain, weights)
|
|
267
|
+ i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain, weights)
|
264
|
268
|
total_ent = calc_ent(Ytrain)
|
265
|
269
|
# print("第", i, "个属性,mean:", mean)
|
266
|
270
|
# 生成节点
|
267
|
|
- parent_node = TreeNode(i, mean, total_ent - min_ent, False, -1, len(Ytrain), distrib(Ytrain))
|
|
271
|
+ parent_node = TreeNode(i, mean, total_ent - min_ent, False, -2, len(Ytrain), distrib(Ytrain))
|
268
|
272
|
|
269
|
273
|
# 切分数据
|
270
|
274
|
right_Ytrain = Ytrain[Xtrain[:, i] > mean]
|