|
@@ -3,99 +3,97 @@ from sklearn.datasets import load_wine, load_iris, load_breast_cancer
|
3
|
3
|
from sklearn.model_selection import train_test_split
|
4
|
4
|
from sklearn.tree import DecisionTreeClassifier
|
5
|
5
|
import numpy as np
|
6
|
|
-from tree import my_tree
|
7
|
|
-
|
8
|
|
# AdaBoost over depth-1 decision trees (stumps) on the breast-cancer dataset.
# Flat-script version: trains up to 20 stumps, then reports training accuracy.

# Load the data and hold out 30% of it as a test split.
wine = load_breast_cancer()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)

# The weight update below multiplies label by prediction, so labels must be
# in {-1, +1}: remap class 0 to -1.
Ytrain = np.where(Ytrain == 0, -1, Ytrain)
Ytest = np.where(Ytest == 0, -1, Ytest)
print(Ytest)

# Initial sample weights: uniform over the training set.
pn = 1 / len(Ytrain)
w = np.full(len(Ytrain), pn)
# Vote weight of every accepted stump.
alphas = []
# Accepted stumps, aligned index-by-index with ``alphas``.
trees = []

for i in range(20):
    # Train one stump on the current sample weights
    # (criterion defaults to Gini when it is not given).
    clf = DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=1)
    clf.fit(Xtrain, Ytrain, sample_weight=w)
    Xpredit = clf.predict(Xtrain)

    missed = Xpredit != Ytrain
    # Weighted error drives boosting; the plain error rate is only printed.
    error = float(np.dot(missed, w))
    p_error = np.count_nonzero(missed) * pn

    # A stump worse than chance is discarded.
    if error > 0.5:
        continue
    # Clamp a perfect stump so that alpha stays finite.
    if error == 0:
        error = 0.001

    print("第", i, "轮错误率", p_error, error)
    # Natural log, to match the exp() used in the weight update below.
    alpha = 0.5 * np.log(1 / error - 1)

    # Update weights: misclassified samples gain weight, then renormalise.
    w = w * np.exp(-alpha * Ytrain * Xpredit)
    w = w / w.sum()

    alphas.append(alpha)
    trees.append(clf)

predicts = [tree.predict(Xtrain) for tree in trees]

print(alphas)
# Weighted vote: each stump's prediction is scaled by its OWN alpha.
result = np.zeros(len(Xtrain), float)
for w_alpha, p in zip(alphas, predicts):
    result = result + w_alpha * p

print("sign前:" , result)
result = np.sign(result)
print("sign后:", result)

print()
# Training accuracy; a tied vote (sign == 0) is counted as half-correct.
x = np.count_nonzero(result == Ytrain) / len(result)
x = x + 0.5 * np.count_nonzero(result == 0) / len(result)
print(x)

# For comparison: 1 - weighted error of the first stump under the final weights.
if trees:
    Ypredit = trees[0].predict(Xtrain)
    error = float(np.dot(Ypredit != Ytrain, w))
    print(1 - error)
|
|
6
|
+from tree.my_tree import MyDT
|
|
7
|
+
|
|
8
|
+
|
|
9
|
def read_data(test_size=0.3, random_state=None):
    """Load the breast-cancer dataset and split it into train and test parts.

    Class label 0 is remapped to -1 so that both label arrays take values in
    {-1, +1}, which is what the boosting code in this file multiplies with.

    Args:
        test_size: Passed to ``train_test_split``; defaults to the 0.3 the
            script has always used.
        random_state: Passed to ``train_test_split``. ``None`` (default)
            keeps the split random on every call; pass an int for a
            reproducible split.

    Returns:
        Tuple ``(Xtrain, Xtest, Ytrain, Ytest)``.
    """
    dataset = load_breast_cancer()
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        dataset.data,
        dataset.target,
        test_size=test_size,
        random_state=random_state,
    )
    # Vectorised relabelling: 0 -> -1, every other label unchanged.
    Ytrain = np.where(Ytrain == 0, -1, Ytrain)
    Ytest = np.where(Ytest == 0, -1, Ytest)
    return Xtrain, Xtest, Ytrain, Ytest
|
|
19
|
+
|
|
20
|
+
|
|
21
|
def fit(Xtrain, Ytrain, n_rounds=20, make_learner=None):
    """Train an AdaBoost ensemble of weak learners.

    Args:
        Xtrain: Training features, handed unchanged to each learner.
        Ytrain: Training labels in {-1, +1}.
        n_rounds: Number of boosting rounds to attempt (default 20).
        make_learner: Zero-argument callable returning a fresh learner with
            ``fit(X, y, sample_weight)`` and ``predict(X)``. Defaults to a
            depth-1 ``MyDT`` using the entropy criterion and one feature.

    Returns:
        Tuple ``(trees, alphas)``: the accepted learners and their vote
        weights, aligned index-by-index. Rounds whose weighted error exceeds
        0.5 are skipped, so both lists may be shorter than ``n_rounds``.

    Raises:
        ValueError: If ``Ytrain`` is empty.
    """
    Ytrain = np.asarray(Ytrain)
    n_samples = Ytrain.shape[0]
    if n_samples == 0:
        raise ValueError("fit() needs at least one training sample")

    if make_learner is None:
        def make_learner():
            return MyDT(criterion="entropy", max_features=1, max_depth=1)

    # Initial sample weights: uniform.
    w = np.full(n_samples, 1.0 / n_samples)
    # Vote weight of each accepted learner.
    alphas = []
    # Accepted learners.
    trees = []

    for i in range(n_rounds):
        # Train one weak learner on the current sample weights.
        clf = make_learner()
        clf.fit(Xtrain, Ytrain, w)
        Xpredit = np.asarray(clf.predict(Xtrain))

        missed = Xpredit != Ytrain
        # Scalar weighted error (drives boosting) and plain error rate (log only).
        error = float(np.dot(missed, w))
        p_error = np.count_nonzero(missed) / n_samples

        # A learner worse than chance is discarded.
        if error > 0.5:
            continue
        # Clamp a perfect learner so that alpha stays finite.
        if error == 0:
            error = 0.001

        print("第", i, "轮错误率", p_error, error)
        # Natural log, to match the exp() used in the weight update below.
        alpha = 0.5 * np.log(1 / error - 1)

        # Update weights: misclassified samples gain weight, then renormalise.
        w = w * np.exp(-alpha * Ytrain * Xpredit)
        w = w / w.sum()

        alphas.append(alpha)
        trees.append(clf)
    return trees, alphas
|
|
59
|
+
|
|
60
|
+
|
|
61
|
def cmp_predict(trees, alphas, Xtrain, Ytrain):
    """Print and return the accuracy of the boosted ensemble on a labelled set.

    Two numbers are printed: the ensemble accuracy, then (when at least one
    learner exists) the accuracy of the first learner alone for comparison.
    Despite the parameter names, any labelled set can be scored.

    Args:
        trees: Fitted learners, each exposing ``predict(X)``.
        alphas: Vote weight of each learner, aligned with ``trees``.
        Xtrain: Features of the samples to score.
        Ytrain: True labels in {-1, +1} for those samples.

    Returns:
        The ensemble accuracy as a float. A tied vote (weighted sum of
        exactly 0) is counted as half-correct.
    """
    Ytrain = np.asarray(Ytrain)
    n_samples = len(Xtrain)

    # Weighted vote: each learner's prediction is scaled by its OWN alpha.
    result = np.zeros(n_samples, float)
    for tree, w_alpha in zip(trees, alphas):
        result = result + w_alpha * np.asarray(tree.predict(Xtrain), dtype=float)
    result = np.sign(result)

    # Fraction of exact matches, plus half credit for ties (sign == 0).
    x = np.count_nonzero(result == Ytrain) / n_samples
    x = x + 0.5 * np.count_nonzero(result == 0) / n_samples
    print(x)

    # Baseline: unweighted accuracy of the first learner on its own.
    if trees:
        Ypredit = np.asarray(trees[0].predict(Xtrain))
        p_error = np.count_nonzero(Ypredit != Ytrain) / Ytrain.shape[0]
        print(1 - p_error)

    return x
|
|
88
|
+
|
|
89
|
+
|
|
90
|
if __name__ == '__main__':
    # Script entry point: load the data, boost, then score both splits.
    # The split stays in module-level names on purpose.
    Xtrain, Xtest, Ytrain, Ytest = read_data()
    trees, alphas = fit(Xtrain, Ytrain)

    print(alphas)

    # Same report for each split: a heading line, then the accuracy figures
    # printed by cmp_predict (training split first, test split second).
    for heading, features, labels in (
        ('训练集上比较', Xtrain, Ytrain),
        ('测试集上比较', Xtest, Ytest),
    ):
        print(heading)
        cmp_predict(trees, alphas, features, labels)
|