# -*- encoding:utf-8 -*-
"""AdaBoost over depth-1 decision trees (`MyDT`) on the breast-cancer dataset.

Labels are remapped from {0, 1} to {-1, +1} so that the sign of the
alpha-weighted sum of tree predictions is the ensemble prediction.
"""
from sklearn.datasets import load_wine, load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np

from tree.my_tree import MyDT

# Substitute for a weighted error of exactly 0, which would otherwise make
# alpha infinite (division by zero inside the logarithm).
_MIN_ERROR = 0.001


def read_data():
    """Load the breast-cancer dataset and split it 70/30 into train/test.

    Returns:
        Tuple ``(Xtrain, Xtest, Ytrain, Ytest)`` where the label arrays use
        -1 in place of the dataset's original 0 class.
    """
    dataset = load_breast_cancer()
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        dataset.data, dataset.target, test_size=0.3
    )
    # Relabel class 0 as -1; every other label is left untouched.
    Ytrain = np.where(Ytrain == 0, -1, Ytrain)
    Ytest = np.where(Ytest == 0, -1, Ytest)
    return Xtrain, Xtest, Ytrain, Ytest


def fit(Xtrain, Ytrain, n_rounds=30):
    """Train an AdaBoost ensemble of depth-1 `MyDT` trees.

    Args:
        Xtrain: Training feature matrix.
        Ytrain: NumPy array of training labels in {-1, +1}.
        n_rounds: Number of boosting rounds to attempt (default 30). Rounds
            whose weighted error exceeds 0.5 are skipped, so fewer trees
            than ``n_rounds`` may be returned.

    Returns:
        Tuple ``(trees, alphas)``: the fitted trees and their vote weights,
        in matching order.
    """
    n_samples = Ytrain.shape[0]
    # Initial sample weights: uniform.
    w = np.full(n_samples, 1.0 / n_samples)
    # Vote weight of each kept tree.
    alphas = []
    # The kept trees themselves.
    trees = []
    for i in range(n_rounds):
        # Train a decision stump on the current sample weights.
        # (Original note: criterion defaults to gini when omitted --
        # TODO confirm against MyDT.)
        clf = MyDT(criterion="gini",
                   max_features=1,
                   max_depth=1,
                   )
        clf.fit(Xtrain, Ytrain, w)
        # Assumes predict returns one label per training row -- TODO confirm.
        Ypredit = np.asarray(clf.predict(Xtrain))
        missed = Ypredit != Ytrain
        error = np.dot(missed, w)      # weighted error
        p_error = np.mean(missed)      # plain misclassification rate
        if error > 0.5:
            # Worse than chance on the weighted sample: discard this tree.
            continue
        if error == 0:
            error = _MIN_ERROR
        print("第", i, "轮错误率", p_error, error)
        # Natural log, to match the exp() used in the re-weighting below.
        alpha = 0.5 * np.log(1 / error - 1)
        # Re-weight: misclassified samples (y * h == -1) gain weight,
        # correctly classified ones lose weight; then renormalize to sum 1.
        w = w * np.exp(-alpha * Ytrain * Ypredit)
        w = w / w.sum()
        alphas.append(alpha)
        trees.append(clf)
    return trees, alphas


def cmp_predict(trees, alphas, Xtrain, Ytrain):
    """Print the ensemble's accuracy and the first tree's accuracy.

    The ensemble prediction is the sign of the sum of each tree's prediction
    multiplied by that tree's own alpha. A sample whose weighted sum is
    exactly 0 (a tie) is counted as half correct.

    Args:
        trees: Fitted trees, as returned by `fit`.
        alphas: Vote weights paired position-by-position with ``trees``.
        Xtrain: Feature matrix to evaluate on.
        Ytrain: NumPy array of true labels in {-1, +1} for ``Xtrain``.

    Raises:
        ValueError: If ``trees`` is empty.
    """
    if not trees:
        raise ValueError("cmp_predict requires at least one fitted tree")

    # Weighted vote: each tree contributes with its OWN alpha.
    votes = np.zeros(len(Xtrain), float)
    for tree, alpha in zip(trees, alphas):
        votes += alpha * np.asarray(tree.predict(Xtrain))
    result = np.sign(votes)

    n_total = len(result)
    n_correct = np.count_nonzero(result == Ytrain)
    n_ties = np.count_nonzero(result == 0)
    x = n_correct / n_total + 0.5 * n_ties / n_total
    print(x)

    # Baseline for comparison: accuracy of the first tree on its own.
    Ypredit = np.asarray(trees[0].predict(Xtrain))
    p_error = np.mean(Ypredit != Ytrain)
    print(1 - p_error)


if __name__ == '__main__':
    Xtrain, Xtest, Ytrain, Ytest = read_data()
    trees, alphas = fit(Xtrain, Ytrain)
    print(alphas)
    print('训练集上比较')
    cmp_predict(trees, alphas, Xtrain, Ytrain)
    print('测试集上比较')
    cmp_predict(trees, alphas, Xtest, Ytest)