123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- # -*- encoding:utf-8 -*-
- from sklearn.datasets import load_wine, load_iris, load_breast_cancer
- from sklearn.model_selection import train_test_split
- from sklearn.tree import DecisionTreeClassifier
- import numpy as np
- from tree.my_tree import MyDT
def read_data():
    """Load the scikit-learn breast cancer dataset and split it for boosting.

    The data is split with ``test_size=0.3`` (no fixed random seed, so the
    split differs between runs) and every label equal to 0 is rewritten to
    -1, leaving the other labels untouched.

    Returns:
        tuple: ``(Xtrain, Xtest, Ytrain, Ytest)`` as returned by
        ``train_test_split``, with the relabelled targets.
    """
    dataset = load_breast_cancer()
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        dataset.data, dataset.target, test_size=0.3
    )
    # Relabel class 0 as -1 in place so the targets are signed.
    for labels in (Ytrain, Ytest):
        labels[labels == 0] = -1
    return Xtrain, Xtest, Ytrain, Ytest
def fit(Xtrain, Ytrain, n_estimators=30, base_learner=None):
    """Train a discrete AdaBoost ensemble of weak classifiers.

    Args:
        Xtrain: Training features; passed unchanged to each weak learner.
        Ytrain: Training labels encoded as -1 / +1. The weight update relies
            on the sign of ``Ytrain * prediction``.
        n_estimators: Number of boosting rounds to attempt. Rounds whose
            weighted error exceeds 0.5 are skipped, so fewer learners may be
            returned.
        base_learner: Optional zero-argument callable returning a fresh,
            unfitted learner that exposes ``fit(X, y, sample_weights)`` and
            ``predict(X)``. Defaults to a depth-1 ``MyDT`` stump using the
            gini criterion and ``max_features=1``.

    Returns:
        tuple: ``(trees, alphas)`` - the accepted learners and their vote
        weights, in training order.

    Raises:
        ValueError: If ``Ytrain`` is empty.
    """
    Ytrain = np.asarray(Ytrain)
    n_samples = Ytrain.shape[0]
    if n_samples == 0:
        raise ValueError("fit() requires at least one training sample")

    if base_learner is None:
        def base_learner():
            return MyDT(criterion="gini", max_features=1, max_depth=1)

    # Start from uniform sample weights.
    w = np.full(n_samples, 1.0 / n_samples)
    # Vote weight of each accepted learner.
    alphas = []
    # Accepted learners.
    trees = []
    for i in range(n_estimators):
        # Train a weak learner on the current sample weights.
        clf = base_learner()
        clf.fit(Xtrain, Ytrain, w)
        Ypredit = np.asarray(clf.predict(Xtrain))
        missed = Ypredit != Ytrain
        error = np.dot(missed, w)            # weighted error rate
        p_error = np.sum(missed) / n_samples  # plain error rate, for logging
        if error > 0.5:
            # Worse than chance on the weighted sample: discard this round.
            continue
        if error == 0:
            # Clamp so the log below stays finite for a perfect learner.
            error = 0.001
        print("第", i, "轮错误率", p_error, error)
        # Natural log, matching the exp() in the weight update below.
        alpha = 0.5 * np.log((1 - error) / error)
        # Up-weight misclassified samples, down-weight correct ones, then
        # renormalise once per round so the weights sum to 1.
        w = w * np.exp(-alpha * Ytrain * Ypredit)
        w = w / np.sum(w)
        alphas.append(alpha)
        trees.append(clf)
    return trees, alphas
def cmp_predict(trees, alphas, Xtrain, Ytrain):
    """Compare the boosted ensemble with its first learner on one dataset.

    The ensemble prediction is ``sign(sum_t alphas[t] * trees[t].predict(X))``.
    Samples whose weighted vote is exactly 0 (a tie) count as half correct.
    Both accuracies are printed: the ensemble first, then the first tree
    alone.

    Args:
        trees: Fitted learners exposing ``predict(X)``.
        alphas: Vote weight of each learner, aligned with ``trees``.
        Xtrain: Feature matrix to evaluate on (train or test data).
        Ytrain: True labels for ``Xtrain``, encoded as -1 / +1.

    Returns:
        tuple: ``(ensemble_accuracy, first_tree_accuracy)``.

    Raises:
        ValueError: If ``trees`` is empty or its length differs from
            ``alphas``.
    """
    if not trees:
        raise ValueError("cmp_predict() needs at least one fitted tree")
    if len(trees) != len(alphas):
        raise ValueError("trees and alphas must have the same length")

    Ytrain = np.asarray(Ytrain)
    n_samples = len(Xtrain)
    predicts = [np.asarray(tree.predict(Xtrain)) for tree in trees]

    # Weighted vote: each learner contributes with its OWN alpha.
    result = np.zeros(n_samples, float)
    for alpha, p in zip(alphas, predicts):
        result += alpha * p
    result = np.sign(result)

    # Fraction of exact hits, plus half credit for tied (zero) votes.
    x = np.count_nonzero(result == Ytrain) / n_samples
    x = x + 0.5 * np.count_nonzero(result == 0) / n_samples
    print(x)

    # Baseline: accuracy of the first learner on its own.
    p_error = np.sum(predicts[0] != Ytrain) / Ytrain.shape[0]
    first_accuracy = 1 - p_error
    print(first_accuracy)
    return x, first_accuracy
if __name__ == '__main__':
    # Script entry point: load the data, train the boosted ensemble, then
    # report accuracy on the training split followed by the test split.
    Xtrain, Xtest, Ytrain, Ytest = read_data()
    trees, alphas = fit(Xtrain, Ytrain)
    print(alphas)
    for title, features, labels in (
        ('训练集上比较', Xtrain, Ytrain),
        ('测试集上比较', Xtest, Ytest),
    ):
        print(title)
        cmp_predict(trees, alphas, features, labels)
|