|
@@ -149,28 +149,19 @@ def cal_ent_attr(Xtrain, Ytrain):
|
149
|
149
|
return min_i, min_mean, min_ent
|
150
|
150
|
|
151
|
151
|
|
152
|
|
-def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
|
|
152
|
+def cal_max_ent_attr_c45(Xtrain, Ytrain):
|
153
|
153
|
max_ent = 0
|
154
|
154
|
max_mean = 0
|
155
|
|
- weights = weights / sum(weights)
|
156
|
|
- h = calc_ent(Ytrain, weights)
|
|
155
|
+ h = calc_ent(Ytrain)
|
157
|
156
|
p = 0
|
158
|
157
|
for k in range(0, len(Xtrain) - 1, 3):
|
159
|
158
|
left = Xtrain[:k + 1]
|
160
|
159
|
right = Xtrain[k + 1:]
|
161
|
160
|
|
162
|
|
- if weights is None:
|
163
|
|
- left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
|
164
|
|
- right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
|
165
|
|
- iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
|
166
|
|
- iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
|
167
|
|
- else:
|
168
|
|
- p += weights[k]
|
169
|
|
- left_ent = calc_ent(Ytrain[:k + 1], weights[:k+1]) * p
|
170
|
|
- right_ent = calc_ent(Ytrain[k + 1:], weights[k+1:]) * (1-p)
|
171
|
|
- iv = -p * np.log2(p)
|
172
|
|
- iv -= (1-p) * np.log2(1-p)
|
173
|
|
-
|
|
161
|
+ left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
|
|
162
|
+ right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
|
|
163
|
+ iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
|
|
164
|
+ iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
|
174
|
165
|
|
175
|
166
|
gain_ent = (h - left_ent - right_ent)/iv
|
176
|
167
|
|
|
@@ -179,21 +170,17 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
|
179
|
170
|
max_mean = left[-1]
|
180
|
171
|
return max_ent, max_mean
|
181
|
172
|
|
182
|
|
-# 样本权重
|
183
|
|
-weights = []
|
184
|
|
-
|
185
|
173
|
# 计算某个属性的信息增益率
|
186
|
|
-def cal_ent_attr_c45(Xtrain, Ytrain, weights):
|
|
174
|
+def cal_ent_attr_c45(Xtrain, Ytrain):
|
187
|
175
|
# 对每个属性
|
188
|
176
|
max_ent = 0
|
189
|
177
|
max_i = 0
|
190
|
178
|
max_mean = 0
|
191
|
|
- weights = weights / sum(weights)
|
192
|
179
|
for i in range(Xtrain.shape[1]): #每个属性
|
193
|
180
|
argsort = Xtrain[:,i].argsort()
|
194
|
|
- x,y,w = Xtrain[:,i][argsort], Ytrain[argsort], weights[argsort]
|
|
181
|
+ x,y = Xtrain[:,i][argsort], Ytrain[argsort]
|
195
|
182
|
|
196
|
|
- gain_ent, mean = cal_max_ent_attr_c45(x, y, w)
|
|
183
|
+ gain_ent, mean = cal_max_ent_attr_c45(x, y)
|
197
|
184
|
|
198
|
185
|
if gain_ent > max_ent:
|
199
|
186
|
max_ent = gain_ent
|
|
@@ -385,13 +372,13 @@ if __name__ == '__main__':
|
385
|
372
|
|
386
|
373
|
weights = np.ones(len(Ytrain))/Ytrain.shape[0]
|
387
|
374
|
print("熵值", calc_ent(Ytrain))
|
388
|
|
- print("熵值", calc_ent(Ytrain, weights))
|
|
375
|
+ print("熵值", calc_ent(Ytrain))
|
389
|
376
|
|
390
|
377
|
print("基尼指数", cal_gini(Ytrain))
|
391
|
378
|
|
392
|
|
- print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain, weights))
|
|
379
|
+ print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain))
|
393
|
380
|
|
394
|
|
- clf = MyDT(criterion="entropy", max_depth=1,)
|
|
381
|
+ clf = MyDT(criterion="C4.5", max_depth=1,)
|
395
|
382
|
clf.fit(Xtrain, Ytrain, weights)
|
396
|
383
|
|
397
|
384
|
# print_width([node], 1)
|