|
@@ -5,7 +5,8 @@ from sklearn.model_selection import train_test_split
|
5
|
5
|
import numpy as np
|
6
|
6
|
|
7
|
7
|
# Feature (column) names for the wine dataset — the 13 original chemistry
# features (Chinese labels), padded with single-letter placeholder names
# 'A'..'T'. NOTE(review): presumably placeholders for extra appended
# columns — confirm the count matches Xtrain.shape[1].
feature_name = ['酒精', '苹果酸', '灰', '灰的碱性', '镁', '总酚', '类黄酮',
                '非黄烷类酚类', '花青素', '颜色强度', '色调', 'od280/od315稀释葡萄酒', '脯氨酸'
                , 'A', 'B', 'c', 'D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T']

# Human-readable display names of the three wine classes.
class_names=["琴酒", "雪莉", "贝尔摩德"]
|
10
|
11
|
|
11
|
12
|
# 生成决策树的节点类型
|
|
@@ -31,9 +32,9 @@ class TreeNode(object):
|
31
|
32
|
self.left = left
|
32
|
33
|
self.right = right
|
33
|
34
|
|
34
|
|
- if self.y == -1:
|
35
|
|
- self.y = np.where(value == np.max(value))[0][0]
|
36
|
|
- print(self.y, self.value)
|
|
35
|
+ # if self.y == -1:
|
|
36
|
+ # self.y = np.where(value == np.max(value))[0][0]
|
|
37
|
+ # print(self.y, self.value)
|
37
|
38
|
|
38
|
39
|
def __str__(self):
|
39
|
40
|
if self.idx == -1:
|
|
@@ -57,14 +58,18 @@ def read_data():
|
57
|
58
|
return Xtrain, Xtest, Ytrain, Ytest
|
58
|
59
|
|
59
|
60
|
|
60
|
|
-def calc_ent(x):
|
|
61
|
+def calc_ent(x, weights=None):
|
61
|
62
|
"""
|
62
|
63
|
calculate shanno ent of x
|
63
|
64
|
"""
|
64
|
65
|
x_value_list = set([x[i] for i in range(x.shape[0])])
|
65
|
66
|
ent = 0.0
|
66
|
67
|
for x_value in x_value_list:
|
67
|
|
- p = float(x[x == x_value].shape[0]) / x.shape[0]
|
|
68
|
+ if weights is None:
|
|
69
|
+ p = float(x[x == x_value].shape[0]) / x.shape[0]
|
|
70
|
+ else:
|
|
71
|
+ weights = weights/sum(weights)
|
|
72
|
+ p = sum(sum([x == x_value]*weights))
|
68
|
73
|
logp = np.log2(p)
|
69
|
74
|
ent -= p * logp
|
70
|
75
|
|
|
@@ -108,31 +113,33 @@ def calc_ent1(x):
|
108
|
113
|
|
109
|
114
|
|
110
|
115
|
# Pick the best attribute to split on by weighted entropy, scanning a
# random sample of 15 column indices (random-forest-style feature bagging).
# Returns (best column index, split threshold, weighted post-split entropy).
def cal_ent_attr(Xtrain, Ytrain, weights):
    # print('sharp', Xtrain.shape)
    # Normalise the per-sample weights so they sum to 1.
    weights = weights / sum(weights)
    # Track the split with the smallest weighted entropy seen so far.
    min_ent = 100
    min_i = 0
    min_mean = 0

    # NOTE(review): randint samples column indices WITH replacement, so
    # duplicates are possible and fewer than 15 distinct attributes may be
    # examined — np.random.choice(..., replace=False) may be intended; confirm.
    for i in np.random.randint(0,Xtrain.shape[1],size=(15)):
        # Candidate threshold: mean of the distinct values in column i.
        x_value_list = set([Xtrain[j][i] for j in range(Xtrain.shape[0])])
        mean = sum(x_value_list)/len(x_value_list)
        sum_ent = 0
        # Binary split: samples strictly above vs. at/below the mean.
        p = Ytrain[Xtrain[:, i] > mean]
        p0 = sum(weights[Xtrain[:, i] > mean])  # total weight on the '>' side
        sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] > mean])*p0
        p = Ytrain[Xtrain[:, i] <= mean]
        sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] <= mean])*(1-p0)

        # '<=' keeps the LAST tying attribute, matching the sampled order.
        if sum_ent <= min_ent:
            min_ent = sum_ent
            min_i = i
            min_mean = mean
    return min_i, min_mean, min_ent
|
133
|
140
|
|
134
|
141
|
|
135
|
|
-def cal_max_ent_attr_c45(Xtrain, Ytrain):
|
|
142
|
+def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
|
136
|
143
|
max_ent = 0
|
137
|
144
|
max_mean = 0
|
138
|
145
|
h = calc_ent(Ytrain)
|
|
@@ -140,12 +147,17 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain):
|
140
|
147
|
left = Xtrain[:k + 1]
|
141
|
148
|
right = Xtrain[k + 1:]
|
142
|
149
|
|
143
|
|
- left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
|
144
|
|
- right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
|
|
150
|
+ if weights is None:
|
|
151
|
+ left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
|
|
152
|
+ right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
|
|
153
|
+
|
|
154
|
+ else:
|
|
155
|
+ pass
|
145
|
156
|
|
146
|
157
|
iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
|
147
|
158
|
iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
|
148
|
159
|
|
|
160
|
+
|
149
|
161
|
gain_ent = (h - left_ent - right_ent)/iv
|
150
|
162
|
|
151
|
163
|
if gain_ent > max_ent:
|
|
@@ -153,6 +165,8 @@ def cal_max_ent_attr_c45(Xtrain, Ytrain):
|
153
|
165
|
max_mean = left[-1]
|
154
|
166
|
return max_ent, max_mean
|
155
|
167
|
|
|
168
|
+# 样本权重
|
|
169
|
+weights = []
|
156
|
170
|
|
157
|
171
|
# 计算某个属性的信息增益率
|
158
|
172
|
def cal_ent_attr_c45(Xtrain, Ytrain):
|
|
@@ -174,7 +188,7 @@ def cal_ent_attr_c45(Xtrain, Ytrain):
|
174
|
188
|
|
175
|
189
|
# 计算某个属性的基尼指数
|
176
|
190
|
def cal_gini_attr(Xtrain, Ytrain):
|
177
|
|
- print('sharp', Xtrain.shape)
|
|
191
|
+ # print('sharp', Xtrain.shape)
|
178
|
192
|
|
179
|
193
|
# 对每个属性
|
180
|
194
|
min_ent = 100
|
|
@@ -196,7 +210,7 @@ def cal_gini_attr(Xtrain, Ytrain):
|
196
|
210
|
min_mean = mean
|
197
|
211
|
return min_i, min_mean, min_ent
|
198
|
212
|
|
199
|
|
# Maximum tree depth before fit() forces a leaf; depth 1 yields decision
# stumps — NOTE(review): presumably for use as AdaBoost weak learners,
# given the sample-weight plumbing elsewhere in this file; confirm.
MAX_T = 1
|
200
|
214
|
|
201
|
215
|
|
202
|
216
|
def is_end(Ytrain):
|
|
@@ -206,22 +220,24 @@ def is_end(Ytrain):
|
206
|
220
|
return True
|
207
|
221
|
|
208
|
222
|
# Force the current sample subset into a leaf node.
def leaf_node(Ytrain, weights):
    """Build a leaf TreeNode labelled with the weighted-majority class.

    Ytrain  : 1-D array of class labels for the samples at this node.
    weights : per-sample weights aligned with Ytrain (AdaBoost-style).

    Returns a leaf TreeNode whose value is the label with the largest
    total weight in Ytrain.
    """
    # Accumulate total weight per class label (replaces the original
    # hand-rolled list-of-pairs scan: O(n) instead of O(n * classes),
    # and enumerate() instead of a manual index counter).
    weight_by_class = {}
    for k, label in enumerate(Ytrain):
        weight_by_class[label] = weight_by_class.get(label, 0) + weights[k]

    # Pick the label with the largest accumulated weight. Strict '>' keeps
    # the first label encountered on ties, matching the original behavior;
    # an empty Ytrain falls through to label 0, also as before.
    max_label, max_weight = 0, 0
    for label, w in weight_by_class.items():
        if w > max_weight:
            max_label, max_weight = label, w

    # print('这个是叶子节点,value:', max_label)
    return TreeNode(-1, 0, 0, True, max_label, len(Ytrain), distrib(Ytrain))
|
226
|
242
|
|
227
|
243
|
|
|
@@ -235,18 +251,18 @@ def distrib(Ytrain):
|
235
|
251
|
return d_list
|
236
|
252
|
|
237
|
253
|
|
238
|
|
-def fit(Xtrain, Ytrain, parent_node, depth):
|
|
254
|
+def fit(Xtrain, Ytrain, parent_node, depth, weights):
|
239
|
255
|
|
240
|
256
|
if is_end(Ytrain):
|
241
|
|
- print('这个是叶子节点')
|
|
257
|
+ # print('这个是叶子节点')
|
242
|
258
|
return TreeNode(-1, 0, 0, True, -1, len(Ytrain), distrib(Ytrain))
|
243
|
259
|
|
244
|
260
|
if depth >= MAX_T:
|
245
|
|
- return leaf_node(Ytrain)
|
|
261
|
+ return leaf_node(Ytrain, weights)
|
246
|
262
|
|
247
|
|
- i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain)
|
|
263
|
+ i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain, weights)
|
248
|
264
|
total_ent = calc_ent(Ytrain)
|
249
|
|
- print("第", i, "个属性,mean:", mean)
|
|
265
|
+ # print("第", i, "个属性,mean:", mean)
|
250
|
266
|
# 生成节点
|
251
|
267
|
parent_node = TreeNode(i, mean, total_ent - min_ent, False, -1, len(Ytrain), distrib(Ytrain))
|
252
|
268
|
|
|
@@ -255,12 +271,12 @@ def fit(Xtrain, Ytrain, parent_node, depth):
|
255
|
271
|
right_Xtrain = Xtrain[Xtrain[:, i] > mean]
|
256
|
272
|
# right_Xtrain = np.delete(right_Xtrain, i, axis=1) # 这个属性还可以再被切分
|
257
|
273
|
|
258
|
|
- right_node = fit(right_Xtrain, right_Ytrain, parent_node, depth+1)
|
|
274
|
+ right_node = fit(right_Xtrain, right_Ytrain, parent_node, depth+1, weights[Xtrain[:, i] > mean])
|
259
|
275
|
|
260
|
276
|
left_Ytrain = Ytrain[Xtrain[:, i] <= mean]
|
261
|
277
|
left_Xtrain = Xtrain[Xtrain[:, i] <= mean]
|
262
|
278
|
# left_Xtrain = np.delete(left_Xtrain, i, axis=1)
|
263
|
|
- left_node = fit(left_Xtrain, left_Ytrain, parent_node, depth + 1)
|
|
279
|
+ left_node = fit(left_Xtrain, left_Ytrain, parent_node, depth + 1, weights[Xtrain[:, i] <= mean])
|
264
|
280
|
|
265
|
281
|
parent_node.left = left_node
|
266
|
282
|
parent_node.right = right_node
|
|
@@ -286,29 +302,37 @@ def print_width(nodes, depth):
|
286
|
302
|
|
287
|
303
|
def predit_one(X, Y, node):
    """Classify one sample by walking the tree from `node` down to a leaf.

    X    : a single feature row (indexable by attribute index).
    Y    : true label — unused here; kept for interface compatibility.
    node : root of the (sub)tree to descend.

    Returns the leaf's class index, with class 0 remapped to -1
    (NOTE(review): presumably AdaBoost's {-1, +1} convention — confirm).
    """
    current = node
    # Iterative descent (equivalent to the recursive form): take the left
    # branch when the attribute value is at or below the split threshold.
    while not current.is_leaf:
        if X[current.idx] <= current.idx_value:
            current = current.left
        else:
            current = current.right
    return -1 if current.y == 0 else current.y
|
295
|
314
|
|
296
|
315
|
|
297
|
|
def predict(Xtest, Ytest, node):
    """Predict a label for every row of Xtest using the tree rooted at `node`.

    Ytest is unused; it is kept so existing callers continue to work.
    Returns a numpy array of predicted labels (class 0 appears as -1,
    per predit_one).
    """
    labels = [predit_one(Xtest[row], None, node)
              for row in range(Xtest.shape[0])]
    return np.array(labels)
|
300
|
321
|
|
301
|
322
|
|
302
|
323
|
if __name__ == '__main__':
    # Load the train/test split, then print several impurity measures of
    # the training labels for comparison.
    Xtrain, Xtest, Ytrain, Ytest = read_data()
    print(calc_ent1(Ytrain))

    # Uniform per-sample weights summing to 1 (AdaBoost-style initialisation).
    weights = np.ones(len(Ytrain))/Ytrain.shape[0]
    print("熵值", calc_ent(Ytrain))
    print("熵值", calc_ent(Ytrain, weights))

    print("基尼指数", cal_gini(Ytrain))

    print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain))

    # Train a depth-limited (MAX_T) weighted tree and print it level by level.
    node = fit(Xtrain, Ytrain, None, 0, weights)
    print_width([node], 1)

    # Predicted labels for the held-out test set.
    print(predict(Xtest, Ytest, node))
|