yufeng0528 4 years ago
parent
commit
5c4516919f
3 changed files with 96 additions and 98 deletions
  1. 94 96
      integr/my_ada_boost.py
  2. 1 1
      tree/example.py
  3. 1 1
      tree/my_tree.py

+ 94 - 96
integr/my_ada_boost.py

@@ -3,99 +3,97 @@ from sklearn.datasets import load_wine, load_iris, load_breast_cancer
3 3
 from sklearn.model_selection import train_test_split
4 4
 from sklearn.tree import DecisionTreeClassifier
5 5
 import numpy as np
6
-from tree import my_tree
7
-
8
-wine = load_breast_cancer()
9
-Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,wine.target,test_size=0.3)
10
-
11
-# 设置数据初始权重
12
-w = np.array([1/len(Ytrain) for i in range(len(Ytrain))])
13
-# 决策树权重
14
-alphas = []
15
-# 决策树数组
16
-trees = []
17
-pn = 1/len(Ytrain)
18
-
19
-for i in range(len(Ytrain)):
20
-    if Ytrain[i] == 0:
21
-        Ytrain[i] = -1
22
-for i in range(len(Ytest)):
23
-    if Ytest[i] == 0:
24
-        Ytest[i] = -1
25
-print(Ytest)
26
-
27
-for i in range(20):
28
-    # 训练决策树
29
-    clf = DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=1,
30
-                                 )  # 实例化,criterion不写的话默认是基尼系数
31
-
32
-    clf.fit(Xtrain, Ytrain, w)
33
-    # nodes = my_tree.fit(Xtrain, Ytrain, None, 0, w)
34
-
35
-    # my_tree.print_width([nodes], 1)
36
-    # print("熵值", my_tree.calc_ent(Ytrain, w))
37
-    # Xpredit = my_tree.predict(Xtrain, Ytrain, nodes)
38
-    Xpredit = clf.predict(Xtrain)
39
-    error = 0
40
-    p_error = 0
41
-    for j in range(len(Ytrain)):
42
-        if Xpredit[j] != Ytrain[j]:
43
-            error += w[j]
44
-            p_error += pn
45
-
46
-    if error > 0.5:
47
-        continue
48
-    if error == 0:
49
-        error = 0.001
50
-
51
-    print("第", i, "轮错误率", p_error, error)
52
-    alpha = 0.5*np.log2(1/error - 1)
53
-
54
-    # 更新权重
55
-    for j in range(Ytrain.shape[0]):
56
-        w[j] = w[j]*np.exp(-alpha*Ytrain[j]*Xpredit[j])
57
-    sum_w = sum(w)
58
-    w = w/sum_w
59
-
60
-    alphas.append(alpha)
61
-    trees.append(clf)
62
-
63
-predicts = []
64
-for tree in trees:
65
-    predicts.append(tree.predict(Xtrain))
66
-
67
-print(alphas)
68
-# 结果加权
69
-result = np.zeros(len(Xtrain), float)
70
-for p in predicts:
71
-    r = 0
72
-    for w_alpha in alphas:
73
-        r += w_alpha * p
74
-    result = result + r
75
-
76
-print("sign前:" , result)
77
-result = np.sign(result)
78
-print("sign后:", result)
79
-
80
-# print(1- sum(np.bitwise_xor(Ytest, result))/len(result))
81
-# print(result == Ytest)
82
-print()
83
-x = len([i for i in result == Ytrain if i])/len(result)
84
-x = x + 0.5*len([i for i in result if i==0])/len(result)
85
-print(x)
86
-# cmp = np.concatenate(([result], [Ytest]), axis=0)
87
-# print(cmp)
88
-
89
-
90
-# clf = DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=2)
91
-# clf = clf.fit(Xtrain, Ytrain)
92
-# print(clf.score(Xtest, Ytest))
93
-
94
-# w = np.array([1/len(Xtrain) for i in range(len(Xtrain))])
95
-# my_tree_0 = trees[0]
96
-Ypredit = trees[0].predict(Xtrain)
97
-error = 0
98
-for j in range(len(Xtrain)):
99
-    if Ypredit[j] != Ytrain[j]:
100
-        error += w[j]
101
-print(1-error)
6
+from tree.my_tree import MyDT
7
+
8
+
9
def read_data():
    """Load the breast-cancer dataset and return a train/test split.

    The data is split with ``test_size=0.3`` and every label equal to 0 is
    replaced by -1 in both label arrays, which is the encoding the boosting
    code in this module multiplies with.

    Returns:
        ``(Xtrain, Xtest, Ytrain, Ytest)`` as produced by
        ``train_test_split``, with the label remapping applied.
    """
    dataset = load_breast_cancer()
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        dataset.data, dataset.target, test_size=0.3)

    # Rewrite the 0 labels in place; all other label values are untouched.
    for labels in (Ytrain, Ytest):
        labels[labels == 0] = -1

    return Xtrain, Xtest, Ytrain, Ytest
19
+
20
+
21
def fit(Xtrain, Ytrain, n_estimators=20):
    """Train a discrete AdaBoost ensemble of depth-1 ``MyDT`` trees.

    Args:
        Xtrain: training feature matrix; passed unchanged to ``MyDT.fit``
            and ``MyDT.predict``.
        Ytrain: training labels. The weight update multiplies label by
            prediction, so labels are expected to be encoded as -1 / +1.
        n_estimators: number of boosting rounds to attempt. Defaults to 20,
            the value that was previously hard-coded.

    Returns:
        ``(trees, alphas)``: the accepted weak learners and their vote
        weights, index-aligned. Rounds whose weighted error exceeds 0.5 are
        skipped, so both lists may be shorter than ``n_estimators``.
    """
    Ytrain = np.asarray(Ytrain)
    n_samples = len(Ytrain)
    # Start from a uniform distribution over the training samples.
    w = np.full(n_samples, 1 / n_samples)
    # Vote weight of each accepted tree.
    alphas = []
    # Accepted trees, in the same order as ``alphas``.
    trees = []

    for i in range(n_estimators):
        # Weak learner: a depth-1 tree trained on the current sample weights.
        clf = MyDT(criterion="entropy", max_features=1, max_depth=1)
        clf.fit(Xtrain, Ytrain, w)
        Xpredit = np.asarray(clf.predict(Xtrain))

        missed = Xpredit != Ytrain
        # Weighted error is a scalar: dot the boolean mask itself with the
        # weights (wrapping the mask in a list would yield a 1-element array).
        error = float(np.dot(missed, w))
        # Plain (unweighted) misclassification rate, for reporting only.
        p_error = missed.sum() / n_samples

        # A learner worse than chance is discarded and the weights are kept.
        if error > 0.5:
            continue
        # Avoid a division by zero / infinite alpha for a perfect learner.
        if error == 0:
            error = 0.001

        print("第", i, "轮错误率", p_error, error)
        # Natural log, so alpha is consistent with the np.exp update below.
        alpha = 0.5 * np.log(1 / error - 1)

        # Re-weight: misclassified samples (label * prediction == -1) grow,
        # correctly classified ones shrink; then renormalise to sum to 1.
        # A new array is built so weights handed to earlier trees are not
        # modified behind their back.
        w = w * np.exp(-alpha * Ytrain * Xpredit)
        w = w / w.sum()

        alphas.append(alpha)
        trees.append(clf)
    return trees, alphas
59
+
60
+
61
def cmp_predict(trees, alphas, Xtrain, Ytrain):
    """Print the ensemble score and the first tree's accuracy on a dataset.

    Two numbers are printed, one per line:

    1. The ensemble score: the fraction of samples whose signed, alpha-weighted
       vote equals the label, plus half the fraction of samples whose vote
       is exactly 0 (a tie counts as half a hit).
    2. The accuracy of ``trees[0]`` alone, as a baseline. This line is
       omitted when ``trees`` is empty.

    Args:
        trees: fitted weak learners; each must provide ``predict(X)``.
        alphas: vote weight of each tree, index-aligned with ``trees``.
        Xtrain: feature matrix to evaluate on (despite the name it can be any
            split, not only the training set).
        Ytrain: labels for ``Xtrain``, encoded as -1 / +1.
    """
    Ytrain = np.asarray(Ytrain)

    # Weighted vote: every tree contributes its own alpha times its own
    # predictions. Pairing each tree with *all* alphas would reduce the
    # ensemble to an unweighted majority vote.
    result = np.zeros(len(Xtrain), float)
    for alpha, tree in zip(alphas, trees):
        result = result + alpha * np.asarray(tree.predict(Xtrain))
    result = np.sign(result)

    # Exact hits count 1, undecided samples (vote == 0) count 0.5.
    x = np.mean(result == Ytrain) + 0.5 * np.mean(result == 0)
    print(float(x))

    # Baseline: accuracy of the first weak learner on its own. Skipped when
    # every boosting round was rejected and no tree exists.
    if trees:
        Ypredit = np.asarray(trees[0].predict(Xtrain))
        p_error = np.mean(Ypredit != Ytrain)
        print(float(1 - p_error))
88
+
89
+
90
if __name__ == '__main__':
    # Script entry point: load the data, boost on the training split, then
    # report the scores on the training split and on the held-out split.
    Xtrain, Xtest, Ytrain, Ytest = read_data()
    trees, alphas = fit(Xtrain, Ytrain)

    print(alphas)

    # The printed titles mean "comparison on the training set" and
    # "comparison on the test set".
    for title, features, labels in (
        ('训练集上比较', Xtrain, Ytrain),
        ('测试集上比较', Xtest, Ytest),
    ):
        print(title)
        cmp_predict(trees, alphas, features, labels)

+ 1 - 1
tree/example.py

@@ -20,7 +20,7 @@ print(wine.target_names)
20 20
 Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,wine.target,test_size=0.3)
21 21
 numpy.savetxt("foo.csv", Xtrain, delimiter=",")
22 22
 
23
-clf = tree.DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=5)#实例化,criterion不写的话默认是基尼系数
23
+clf = tree.DecisionTreeClassifier(criterion="entropy", max_features=1, max_depth=1)#实例化,criterion不写的话默认是基尼系数
24 24
 # clf.n_features_ = 2
25 25
 clf = clf.fit(Xtrain, Ytrain)
26 26
 score = clf.score(Xtest, Ytest) #返回预测的准确度

+ 1 - 1
tree/my_tree.py

@@ -304,7 +304,7 @@ class MyDT(object):
304 304
 
305 305
     root_node = None
306 306
 
307
-    def __init__(self, criterion, max_depth):
307
+    def __init__(self, criterion, max_depth, max_features=1):
308 308
         self.criterion = criterion
309 309
         self.max_depth = max_depth
310 310