#!/usr/bin/python
# -*- coding: UTF-8 -*-
from sklearn.datasets import load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np

# Display names for the 13 wine features; the trailing single letters are placeholders
# so the list is also long enough for the 30 breast-cancer features.
feature_name = ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
                'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins',
                'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline',
                'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
                'O', 'P', 'Q', 'R', 'S', 'T']
class_names = ["Gin", "Sherry", "Vermouth"]  # display names for the three wine classes

# Node type used to build the decision tree.
class TreeNode(object):
    def __init__(self, idx, idx_value, ent, is_leaf, y, samples, value, left=None, right=None):
        self.idx = idx              # index of the split feature (-1 for a leaf)
        self.idx_value = idx_value  # threshold on that feature
        self.is_leaf = is_leaf
        self.ent = ent              # information gain (or gain ratio / Gini reduction) of the split
        self.y = y                  # predicted class
        self.samples = samples      # number of samples reaching this node
        self.value = value          # class distribution at this node
        self.left = left
        self.right = right
        if self.y is None:
            # Predict the majority class of the distribution.
            self.y = int(np.argmax(value))

    def __str__(self):
        return self.export(feature_name, class_names)

    def export(self, _feature_name, _class_names):
        name = '' if self.idx == -1 else _feature_name[self.idx]
        return "%s,%f, ent:%f, leaf:%d, samples:%d, value:%s, %s" % \
               (name, self.idx_value, self.ent, self.is_leaf, self.samples,
                self.value, _class_names[self.y])

def read_data():
    wine = load_wine()
    print("data shape:", wine.data.shape)  # 178 samples x 13 features
    print(wine.target)
    print(wine.feature_names)
    print(wine.target_names)
    # If wine were a single table, it would look like this:
    # import pandas as pd
    # pdata = pd.concat([pd.DataFrame(wine.data), pd.DataFrame(wine.target)], axis=1)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data, wine.target, test_size=0.3)
    return Xtrain, Xtest, Ytrain, Ytest
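
# Added note (not part of the original script): the commented-out pd.concat above
# would produce a 178 x 14 DataFrame (the 13 feature columns plus one target column).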

def calc_ent(x, weights=None):
    """Calculate the Shannon entropy of label vector x (optionally sample-weighted)."""
    x_value_list = set(x)
    ent = 0.0
    if weights is not None:
        weights = weights / sum(weights)  # normalize so the weights sum to 1
    for x_value in x_value_list:
        if weights is None:
            p = float(x[x == x_value].shape[0]) / x.shape[0]
        else:
            p = np.sum(weights[x == x_value])
        if p == 0:
            continue  # skip zero-probability classes so log2(0) never occurs
        logp = np.log2(p)
        ent -= p * logp
    return ent
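
# Added note (not part of the original script): for class probabilities p_k the
# Shannon entropy is H = -sum_k p_k * log2(p_k). A quick worked example, assuming
# calc_ent is called on a plain numpy label vector:
#   calc_ent(np.array([0, 0, 1, 1]))                      -> 1.0 bit (p = 0.5 / 0.5)
#   calc_ent(np.array([0, 0, 1, 1]), weights=np.ones(4))  -> 1.0 as well, since the
#   weights normalize to 0.25 each and every class still gets total weight 0.5.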

def cal_gini(x):
    """Calculate the Gini impurity of label vector x."""
    x_value_list = set(x)
    sq_sum = 0.0
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        sq_sum += p * p
    return 1 - sq_sum
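
# Added note (not part of the original script): Gini impurity is
# Gini = 1 - sum_k p_k^2. For the same balanced example,
#   cal_gini(np.array([0, 0, 1, 1]))  -> 1 - (0.5^2 + 0.5^2) = 0.5
# and a pure vector such as np.array([1, 1, 1]) gives 0.0.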

def calc_ent1(x):
    """Calculate the Shannon entropy of x by counting class frequencies by hand."""
    p_set = []  # list of [value, count] pairs
    for item in x:
        for i in p_set:
            if i[0] == item:
                i[1] = i[1] + 1
                break
        else:  # value not seen yet
            p_set.append([item, 1])
    pro_list = []
    size = len(x)
    for item in p_set:
        pro_list.append(item[1] / size)
    ent = 0.0
    for p in pro_list:
        logp = np.log2(p)
        ent -= p * logp
    return ent
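
# Added note (not part of the original script): calc_ent1 should agree with the
# unweighted calc_ent, e.g. both return 1.0 for np.array([0, 0, 1, 1]) and
# log2(3) ≈ 1.585 for np.array([0, 1, 2]).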

# Pick the attribute (and threshold) with the largest information gain, i.e. the
# smallest weighted entropy after a binary split at the mean of the attribute's values.
def cal_ent_attr(Xtrain, Ytrain):
    min_ent = float('inf')
    min_i = 0
    min_mean = 0
    for i in range(Xtrain.shape[1]):  # for every attribute
        x_value_list = set(Xtrain[:, i])
        mean = sum(x_value_list) / len(x_value_list)  # threshold: mean of the distinct values
        sum_ent = 0
        # binary split: > mean goes right, <= mean goes left
        p = Ytrain[Xtrain[:, i] > mean]
        p0 = len(p) / Ytrain.shape[0]
        sum_ent = sum_ent + calc_ent(p) * p0
        p = Ytrain[Xtrain[:, i] <= mean]
        sum_ent = sum_ent + calc_ent(p) * (1 - p0)
        if sum_ent <= min_ent:
            min_ent = sum_ent
            min_i = i
            min_mean = mean
    return min_i, min_mean, min_ent
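
# Added note (not part of the original script): the quantity minimized above is the
# conditional entropy H(D|a) = sum_v |D_v|/|D| * H(D_v), so minimizing it maximizes
# the information gain Gain(D, a) = H(D) - H(D|a). Worked example: labels [0, 0, 1, 1]
# with H(D) = 1.0, split into pure halves [0, 0] and [1, 1], give
# H(D|a) = 0.5*0 + 0.5*0 = 0 and therefore Gain = 1.0.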

def cal_max_ent_attr_c45(Xtrain, Ytrain):
    """Best gain ratio for one (already sorted) attribute column, C4.5 style."""
    max_ent = 0
    max_mean = 0
    h = calc_ent(Ytrain)  # entropy before the split
    for k in range(0, len(Xtrain) - 1, 3):  # only try every 3rd split point, for speed
        left = Xtrain[:k + 1]
        right = Xtrain[k + 1:]
        left_ent = calc_ent(Ytrain[:k + 1]) * len(left) / len(Ytrain)
        right_ent = calc_ent(Ytrain[k + 1:]) * len(right) / len(Ytrain)
        # intrinsic value (split information) of this binary partition
        iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
        iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
        gain_ent = (h - left_ent - right_ent) / iv  # gain ratio
        if gain_ent > max_ent:
            max_ent = gain_ent
            max_mean = left[-1]  # threshold: largest value that falls on the left side
    return max_ent, max_mean
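
# Added note (not part of the original script): the value returned above is C4.5's
# gain ratio GainRatio(D, a) = Gain(D, a) / IV(a), where the intrinsic value of a
# binary split is IV(a) = -sum_v |D_v|/|D| * log2(|D_v|/|D|). For the pure 2/2 split
# from the previous example, Gain = 1.0 and IV = 1.0, so GainRatio = 1.0.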

# Pick the attribute (and threshold) with the largest information gain ratio (C4.5).
def cal_ent_attr_c45(Xtrain, Ytrain):
    max_ent = 0
    max_i = 0
    max_mean = 0
    for i in range(Xtrain.shape[1]):  # for every attribute
        argsort = Xtrain[:, i].argsort()
        x, y = Xtrain[:, i][argsort], Ytrain[argsort]  # sort the column and the labels together
        gain_ent, mean = cal_max_ent_attr_c45(x, y)
        if gain_ent > max_ent:
            max_ent = gain_ent
            max_i = i
            max_mean = mean
    return max_i, max_mean, max_ent

# Pick the attribute (and threshold) with the smallest Gini index (CART).
def cal_gini_attr(Xtrain, Ytrain):
    min_ent = float('inf')
    min_i = 0
    min_mean = 0
    for i in range(Xtrain.shape[1]):  # for every attribute
        x_value_list = set(Xtrain[:, i])
        mean = sum(x_value_list) / len(x_value_list)  # threshold: mean of the distinct values
        sum_ent = 0
        # binary split: > mean goes right, <= mean goes left
        p = Ytrain[Xtrain[:, i] > mean]
        sum_ent = sum_ent + cal_gini(p) * len(p) / len(Ytrain)
        p = Ytrain[Xtrain[:, i] <= mean]
        sum_ent = sum_ent + cal_gini(p) * len(p) / len(Ytrain)
        if sum_ent < min_ent:
            min_ent = sum_ent
            min_i = i
            min_mean = mean
    return min_i, min_mean, min_ent
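
# Added note (not part of the original script): this is the CART split criterion
# Gini_index(D, a) = sum_v |D_v|/|D| * Gini(D_v); the attribute/threshold pair with
# the smallest value is chosen. For the pure 2/2 split above it is 0.5*0 + 0.5*0 = 0.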

def is_end(Ytrain):
    if len(Ytrain) == 0:
        return True
    if len(set(Ytrain)) == 1:  # only one class left
        return True
    return False

# Force the current samples into a leaf node labelled with the majority class.
def leaf_node(Ytrain):
    p_set = []  # list of [value, count] pairs
    for item in Ytrain:
        for i in p_set:
            if i[0] == item:
                i[1] = i[1] + 1
                break
        else:  # value not seen yet
            p_set.append([item, 1])
    max_item = [0, 0]
    for item in p_set:
        if item[1] > max_item[1]:
            max_item = item
    return TreeNode(-1, 0, 0, True, max_item[0], len(Ytrain), distrib(Ytrain))

def distrib(Ytrain):
    """Class distribution as counts. Hard-coded to 3 slots; a label of -1 (used by
    read_data_1) simply wraps around to the last slot."""
    d_list = np.zeros(3, dtype=int)
    for x_value in set(Ytrain):
        d_list[x_value] = np.sum(Ytrain == x_value)
    return d_list

# Print the tree level by level (breadth first).
def print_width(nodes, depth, _feature_name, _class_names):
    if len(nodes) == 0:
        return
    print("-- level", depth, "--")
    node_down = []
    for node in nodes:
        print(node.export(_feature_name, _class_names))
        if node.left is not None:
            node_down.append(node.left)
        if node.right is not None:
            node_down.append(node.right)
    print_width(node_down, depth + 1, _feature_name, _class_names)

def predict_one(X, node):
    """Walk the tree for a single sample and return the predicted class."""
    if node.is_leaf:
        return node.y
    if X[node.idx] <= node.idx_value:
        return predict_one(X, node.left)
    return predict_one(X, node.right)

class MyDT(object):
    """A small decision-tree classifier with an sklearn-like fit/predict/score API."""

    def __init__(self, criterion, max_depth, max_features=1):
        self.criterion = criterion        # 'entropy' (ID3-style), 'C4.5', or anything else for Gini (CART)
        self.max_depth = max_depth
        self.max_features = max_features  # accepted for API compatibility, currently unused
        self.root_node = None

    def fit(self, Xtrain, Ytrain, sample_weight=None):
        if sample_weight is not None:
            # Approximate weighted training by resampling the data according to the weights.
            indices = np.random.choice(Xtrain.shape[0], Ytrain.shape[0], p=sample_weight)
            Xtrain = Xtrain[indices]
            Ytrain = Ytrain[indices]
        self.root_node = self.do_fit(Xtrain, Ytrain, 0)

    def do_fit(self, Xtrain, Ytrain, depth):
        if is_end(Ytrain):           # pure node (or no samples): make a leaf
            return leaf_node(Ytrain)
        if depth >= self.max_depth:  # depth limit reached: make a leaf
            return leaf_node(Ytrain)
        if self.criterion == 'entropy':
            i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain)
        elif self.criterion == 'C4.5':
            i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain)
        else:
            i, mean, min_ent = cal_gini_attr(Xtrain, Ytrain)
        total_ent = 0  # could be calc_ent(Ytrain); left at 0, so node.ent only records -min_ent
        # Create the internal node for the chosen attribute i and threshold mean.
        parent_node = TreeNode(i, mean, total_ent - min_ent, False, None, len(Ytrain), distrib(Ytrain))
        # Split the data; the attribute stays available for further splits below this node.
        right_position = Xtrain[:, i] > mean
        right_node = self.do_fit(Xtrain[right_position], Ytrain[right_position], depth + 1)
        left_position = Xtrain[:, i] <= mean
        left_node = self.do_fit(Xtrain[left_position], Ytrain[left_position], depth + 1)
        parent_node.left = left_node
        parent_node.right = right_node
        return parent_node

    def predict(self, Xtest):
        result = []
        for i in range(Xtest.shape[0]):
            result.append(predict_one(Xtest[i], self.root_node))
        return np.array(result)

    def score(self, Xtest, Ytest):
        result = self.predict(Xtest)
        return sum(result == Ytest) / Ytest.shape[0]

    @staticmethod
    def export(tree, feature_names, class_names):
        print_width([tree.root_node], 1, feature_names, class_names)

def read_data_1():
    cancer = load_breast_cancer()
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(cancer.data, cancer.target, test_size=0.3)
    # Remap label 0 to -1 (the {-1, +1} convention used e.g. by boosting).
    Ytrain[Ytrain == 0] = -1
    Ytest[Ytest == 0] = -1
    return Xtrain, Xtest, Ytrain, Ytest

if __name__ == '__main__':
    Xtrain, Xtest, Ytrain, Ytest = read_data_1()
    print("entropy (frequency count):", calc_ent1(Ytrain))
    weights = np.ones(len(Ytrain)) / Ytrain.shape[0]  # uniform sample weights
    print("entropy:", calc_ent(Ytrain))
    print("Gini impurity:", cal_gini(Ytrain))
    print("best C4.5 split (attribute index, threshold, gain ratio):", cal_ent_attr_c45(Xtrain, Ytrain))
    clf = MyDT(criterion="C4.5", max_depth=1)
    clf.fit(Xtrain, Ytrain, weights)
    print(clf.predict(Xtest))
    print("test accuracy:", clf.score(Xtest, Ytest))
    print("train accuracy:", clf.score(Xtrain, Ytrain))
    MyDT.export(clf, feature_name, class_names)
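
# Added usage sketch (not part of the original script, left commented out so the
# behaviour above is unchanged): the same class also runs on the 3-class wine data
# returned by read_data(), e.g. with the plain entropy criterion:
#
#   Xtrain, Xtest, Ytrain, Ytest = read_data()
#   wine_clf = MyDT(criterion='entropy', max_depth=3)
#   wine_clf.fit(Xtrain, Ytrain)
#   print("wine test accuracy:", wine_clf.score(Xtest, Ytest))
#   MyDT.export(wine_clf, feature_name, class_names)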