### 使用重采样法

yufeng0528 2 years ago
parent
commit
4311d0b82a
2 changed files with 14 additions and 27 deletions
1. 2 2
 ``@@ -28,9 +28,9 @@ def fit(Xtrain, Ytrain):`` 28 28 `` pn = 1/len(Ytrain)`` 29 29 `` # print(Ytest)`` 30 30 `` `` 31 ``- for i in range(20):`` 31 ``+ for i in range(30):`` 32 32 `` # 训练决策树`` 33 ``- clf = MyDT(criterion="entropy", max_features=1, max_depth=1,`` 33 ``+ clf = MyDT(criterion="gini", max_features=1, max_depth=1,`` 34 34 `` ) # 实例化，criterion不写的话默认是基尼系数`` 35 35 `` `` 36 36 `` clf.fit(Xtrain, Ytrain, w)``
 ``@@ -149,28 +149,19 @@ def cal_ent_attr(Xtrain, Ytrain):`` 149 149 `` return min_i, min_mean, min_ent`` 150 150 `` `` 151 151 `` `` 152 ``-def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):`` 152 ``+def cal_max_ent_attr_c45(Xtrain, Ytrain):`` 153 153 `` max_ent = 0`` 154 154 `` max_mean = 0`` 155 ``- weights = weights / sum(weights)`` 156 ``- h = calc_ent(Ytrain, weights)`` 155 ``+ h = calc_ent(Ytrain)`` 157 156 `` p = 0`` 158 157 `` for k in range(0, len(Xtrain) - 1, 3):`` 159 158 `` left = Xtrain[:k + 1]`` 160 159 `` right = Xtrain[k + 1:]`` 161 160 `` `` 162 ``- if weights is None:`` 163 ``- left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)`` 164 ``- right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)`` 165 ``- iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))`` 166 ``- iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))`` 167 ``- else:`` 168 ``- p += weights[k]`` 169 ``- left_ent = calc_ent(Ytrain[:k + 1], weights[:k+1]) * p`` 170 ``- right_ent = calc_ent(Ytrain[k + 1:], weights[k+1:]) * (1-p)`` 171 ``- iv = -p * np.log2(p)`` 172 ``- iv -= (1-p) * np.log2(1-p)`` 173 ``-`` 161 ``+ left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)`` 162 ``+ right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)`` 163 ``+ iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))`` 164 ``+ iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))`` 174 165 `` `` 175 166 `` gain_ent = (h - left_ent - right_ent)/iv`` 176 167 `` `` ``@@ -179,21 +170,17 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):`` 179 170 `` max_mean = left[-1]`` 180 171 `` return max_ent, max_mean`` 181 172 `` `` 182 ``-# 样本权重`` 183 ``-weights = []`` 184 ``-`` 185 173 `` # 计算某个属性的信息增益率`` 186 ``-def cal_ent_attr_c45(Xtrain, Ytrain, weights):`` 174 ``+def cal_ent_attr_c45(Xtrain, Ytrain):`` 187 175 `` # 对每个属性`` 188 176 `` max_ent = 0`` 189 177 `` max_i = 0`` 190 178 `` max_mean = 0`` 191 ``- weights = weights / sum(weights)`` 
192 179 `` for i in range(Xtrain.shape): #每个属性`` 193 180 `` argsort = Xtrain[:,i].argsort()`` 194 ``- x,y,w = Xtrain[:,i][argsort], Ytrain[argsort], weights[argsort]`` 181 ``+ x,y = Xtrain[:,i][argsort], Ytrain[argsort]`` 195 182 `` `` 196 ``- gain_ent, mean = cal_max_ent_attr_c45(x, y, w)`` 183 ``+ gain_ent, mean = cal_max_ent_attr_c45(x, y)`` 197 184 `` `` 198 185 `` if gain_ent > max_ent:`` 199 186 `` max_ent = gain_ent`` ``@@ -385,13 +372,13 @@ if __name__ == '__main__':`` 385 372 `` `` 386 373 `` weights = np.ones(len(Ytrain))/Ytrain.shape`` 387 374 `` print("熵值", calc_ent(Ytrain))`` 388 ``- print("熵值", calc_ent(Ytrain, weights))`` 375 ``+ print("熵值", calc_ent(Ytrain))`` 389 376 `` `` 390 377 `` print("基尼指数", cal_gini(Ytrain))`` 391 378 `` `` 392 ``- print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain, weights))`` 379 ``+ print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain))`` 393 380 `` `` 394 ``- clf = MyDT(criterion="entropy", max_depth=1,)`` 381 ``+ clf = MyDT(criterion="C4.5", max_depth=1,)`` 395 382 `` clf.fit(Xtrain, Ytrain, weights)`` 396 383 `` `` 397 384 `` # print_width([node], 1)``