#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""A small hand-rolled binary decision-tree classifier.

Three split criteria are supported by :class:`MyDT`:

* ``'entropy'`` -- ID3-style information gain with a mean-of-unique-values
  threshold per feature,
* ``'C4.5'``    -- information gain *ratio* over scanned split points,
* anything else -- CART-style Gini impurity.

sklearn is only needed by the two data-loading helpers (``read_data`` /
``read_data_1``) and is imported lazily there, so the core tree code depends
on numpy alone.
"""
import numpy as np

# Display names for the wine features (first 13) padded with placeholders
# so deeper/synthetic feature indices still render.
feature_name = ['酒精', '苹果酸', '灰', '灰的碱性', '镁', '总酚', '类黄酮',
                '非黄烷类酚类', '花青素', '颜色强度', '色调',
                'od280/od315稀释葡萄酒', '脯氨酸',
                'A', 'B', 'c', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T']
class_names = ["琴酒", "雪莉", "贝尔摩德"]


class TreeNode(object):
    """One node of the binary decision tree.

    A sample goes to the *right* child when ``sample[idx] > idx_value``,
    otherwise to the left child.  Leaves carry ``idx == -1``.
    """

    def __init__(self, idx, idx_value, ent, is_leaf, y, samples, value,
                 left=None, right=None):
        self.idx = idx              # feature index used for the split; -1 for a leaf
        self.idx_value = idx_value  # split threshold
        self.is_leaf = is_leaf
        self.ent = ent              # criterion score (information gain) at this node
        self.y = y                  # predicted class; None means "derive from value"
        self.samples = samples      # number of training samples reaching this node
        self.value = value          # per-class sample counts at this node
        self.left = left
        self.right = right
        if self.y is None:
            # Majority vote: first class with the maximal count
            # (same tie-break as np.where(value == max)[0][0]).
            self.y = int(np.argmax(value))

    def __str__(self):
        # Render with the module-level display names.
        return self.export(feature_name, class_names)

    def export(self, _feature_name, _class_names):
        """Render the node using the supplied feature/class display names."""
        name = '' if self.idx == -1 else _feature_name[self.idx]
        return "%s,%f, ent:%f, leaf:%d, samples:%d, value:%s, %s" % \
            (name, self.idx_value, self.ent, self.is_leaf, self.samples,
             self.value, _class_names[self.y])


def read_data():
    """Load the wine data set and return a 70/30 train/test split."""
    # Lazy import: keeps the tree code free of a hard sklearn dependency.
    from sklearn.datasets import load_wine
    from sklearn.model_selection import train_test_split
    wine = load_wine()
    print("数组结构", wine.data.shape)  # 178 * 13
    print(wine.target)
    print(wine.feature_names)
    print(wine.target_names)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        wine.data, wine.target, test_size=0.3)
    return Xtrain, Xtest, Ytrain, Ytest


def calc_ent(x, weights=None):
    """Return the Shannon entropy (base 2) of label vector ``x``.

    When ``weights`` is given, each sample contributes its normalised
    weight to the class probabilities instead of a uniform 1/n.
    Empty input yields 0.0.
    """
    ent = 0.0
    if weights is not None:
        weights = weights / np.sum(weights)
    for x_value in np.unique(x):
        if weights is None:
            p = np.count_nonzero(x == x_value) / x.shape[0]
        else:
            # Total (normalised) weight of the samples in this class.
            p = float(np.sum(weights[x == x_value]))
        if p > 0.0:  # guard log2(0): zero-weight classes contribute nothing
            ent -= p * np.log2(p)
    return ent


def cal_gini(x):
    """Return the Gini impurity ``1 - sum(p_k^2)`` of label vector ``x``."""
    gini = 1.0
    for x_value in np.unique(x):
        p = np.count_nonzero(x == x_value) / x.shape[0]
        gini -= p * p
    return gini


def calc_ent1(x):
    """Pure-Python entropy of ``x`` (cross-check for :func:`calc_ent`)."""
    counts = {}
    for item in x:
        counts[item] = counts.get(item, 0) + 1
    n = len(x)
    ent = 0.0
    for c in counts.values():
        p = c / n
        ent -= p * np.log2(p)
    return ent


def cal_ent_attr(Xtrain, Ytrain):
    """ID3-style attribute selection (binary split at the mean).

    For every feature, the threshold is the mean of its unique values and
    the score is the size-weighted entropy of the two children.

    Returns ``(feature_index, threshold, weighted_entropy)`` of the best
    (lowest-entropy) feature.
    """
    min_ent, min_i, min_mean = 100, 0, 0
    for i in range(Xtrain.shape[1]):
        uniques = np.unique(Xtrain[:, i])
        mean = float(np.sum(uniques)) / len(uniques)
        mask = Xtrain[:, i] > mean
        p0 = np.count_nonzero(mask) / Ytrain.shape[0]
        sum_ent = calc_ent(Ytrain[mask]) * p0 \
            + calc_ent(Ytrain[~mask]) * (1 - p0)
        if sum_ent <= min_ent:
            min_ent, min_i, min_mean = sum_ent, i, mean
    return min_i, min_mean, min_ent


def cal_max_ent_attr_c45(Xtrain, Ytrain):
    """Best split point of one *pre-sorted* feature column, by gain ratio.

    Candidate split points are scanned every 3 samples to keep the search
    cheap.  Returns ``(gain_ratio, threshold)``; the threshold is the last
    value that stays in the left partition.
    """
    max_ent = 0
    max_mean = 0
    h = calc_ent(Ytrain)  # parent entropy
    n = len(Ytrain)
    for k in range(0, len(Xtrain) - 1, 3):
        n_left = k + 1
        n_right = n - n_left
        left_ent = calc_ent(Ytrain[:n_left]) * n_left / n
        right_ent = calc_ent(Ytrain[n_left:]) * n_right / n
        # Intrinsic value of the split: the C4.5 normaliser.  Both sides are
        # non-empty here, so iv > 0.
        iv = -n_left / n * np.log2(n_left / n) \
            - n_right / n * np.log2(n_right / n)
        gain_ratio = (h - left_ent - right_ent) / iv
        if gain_ratio > max_ent:
            max_ent = gain_ratio
            max_mean = Xtrain[k]
    return max_ent, max_mean


def cal_ent_attr_c45(Xtrain, Ytrain):
    """C4.5 attribute selection: maximise the information gain ratio.

    Returns ``(feature_index, threshold, gain_ratio)``.
    """
    max_ent, max_i, max_mean = 0, 0, 0
    for i in range(Xtrain.shape[1]):
        # Sort the column (and labels in lockstep) before scanning splits.
        order = Xtrain[:, i].argsort()
        gain_ratio, mean = cal_max_ent_attr_c45(Xtrain[:, i][order],
                                                Ytrain[order])
        if gain_ratio > max_ent:
            max_ent, max_i, max_mean = gain_ratio, i, mean
    return max_i, max_mean, max_ent


def cal_gini_attr(Xtrain, Ytrain):
    """CART attribute selection: minimise the size-weighted Gini impurity
    of a binary split at each feature's mean of unique values.

    Returns ``(feature_index, threshold, weighted_gini)``.
    """
    min_ent, min_i, min_mean = 100, 0, 0
    for i in range(Xtrain.shape[1]):
        uniques = np.unique(Xtrain[:, i])
        mean = float(np.sum(uniques)) / len(uniques)
        mask = Xtrain[:, i] > mean
        right, left = Ytrain[mask], Ytrain[~mask]
        sum_ent = cal_gini(right) * len(right) / len(Ytrain) \
            + cal_gini(left) * len(left) / len(Ytrain)
        if sum_ent < min_ent:
            min_ent, min_i, min_mean = sum_ent, i, mean
    return min_i, min_mean, min_ent


def is_end(Ytrain):
    """True when a node is terminal: no samples left, or a single class."""
    return len(Ytrain) == 0 or len(set(Ytrain)) == 1


def leaf_node(Ytrain):
    """Force a leaf that predicts the majority class of ``Ytrain``.

    Ties go to the first-seen class; an empty node defaults to class 0.
    """
    counts = {}
    for label in Ytrain:
        counts[label] = counts.get(label, 0) + 1
    y = max(counts, key=counts.get) if counts else 0
    return TreeNode(-1, 0, 0, True, y, len(Ytrain), distrib(Ytrain))


def distrib(Ytrain):
    """Per-class sample counts as a length-3 int array.

    NOTE(review): the 3 slots assume labels index into the array directly;
    the -1 labels produced by read_data_1 land in the last slot — confirm
    that is intended before adding more classes.
    """
    d_list = np.zeros(3, dtype=int)
    for x_value in np.unique(Ytrain):
        d_list[x_value] = int(np.count_nonzero(Ytrain == x_value))
    return d_list


def print_width(nodes, depth, _feature_name, _class_names):
    """Breadth-first dump of the tree, one level per recursive call."""
    if len(nodes) == 0:
        return
    print("--第", depth, "层--")
    next_level = []
    for node in nodes:
        print(node.export(_feature_name, _class_names))
        if node.left is not None:
            next_level.append(node.left)
        if node.right is not None:
            next_level.append(node.right)
    print_width(next_level, depth + 1, _feature_name, _class_names)


def predit_one(X, Y, node):
    """Route one sample down the tree and return the predicted class.

    ``Y`` is unused; the parameter is kept for backward compatibility.
    """
    while not node.is_leaf:
        node = node.left if X[node.idx] <= node.idx_value else node.right
    return node.y


class MyDT(object):
    """Minimal decision-tree classifier with a sklearn-like interface."""

    def __init__(self, criterion, max_depth, max_features=1):
        self.criterion = criterion        # 'entropy', 'C4.5', else -> Gini
        self.max_depth = max_depth        # hard depth budget for the tree
        self.max_features = max_features  # accepted for API parity; unused
        self.root_node = None             # set by fit()

    def fit(self, Xtrain, Ytrain, sample_weight=None):
        """Build the tree.

        With ``sample_weight`` the training set is first re-sampled with
        replacement proportionally to the weights (they must sum to 1),
        which emulates weighted training for boosting.
        """
        if sample_weight is not None:
            indices = np.random.choice(Xtrain.shape[0], Ytrain.shape[0],
                                       p=sample_weight)
            Xtrain = Xtrain[indices]
            Ytrain = Ytrain[indices]
        self.root_node = self.do_fit(Xtrain, Ytrain, 0)

    def do_fit(self, Xtrain, Ytrain, depth):
        """Recursively grow a subtree and return its root node."""
        # Stop on a pure/empty node or when the depth budget is spent.
        if is_end(Ytrain) or depth >= self.max_depth:
            return leaf_node(Ytrain)
        if self.criterion == 'entropy':
            i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain)
        elif self.criterion == 'C4.5':
            i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain)
        else:
            i, mean, min_ent = cal_gini_attr(Xtrain, Ytrain)
        total_ent = 0  # placeholder parent entropy; ent stores -criterion
        parent_node = TreeNode(i, mean, total_ent - min_ent, False, None,
                               len(Ytrain), distrib(Ytrain))
        # Split on the chosen feature; the feature remains available to
        # deeper splits (continuous attributes may be reused).
        right_position = Xtrain[:, i] > mean
        right_node = self.do_fit(Xtrain[right_position],
                                 Ytrain[right_position], depth + 1)
        left_position = Xtrain[:, i] <= mean
        left_node = self.do_fit(Xtrain[left_position],
                                Ytrain[left_position], depth + 1)
        parent_node.left = left_node
        parent_node.right = right_node
        return parent_node

    def predict(self, Xtest):
        """Predict a class label for every row of ``Xtest``."""
        return np.array([predit_one(Xtest[i], None, self.root_node)
                         for i in range(Xtest.shape[0])])

    def score(self, Xtest, Ytest):
        """Mean accuracy of ``predict(Xtest)`` against ``Ytest``."""
        result = self.predict(Xtest)
        return np.sum(result == Ytest) / Ytest.shape[0]

    @staticmethod
    def export(tree, feature_names, class_names):
        """Print the tree level by level with readable names."""
        print_width([tree.root_node], 1, feature_names, class_names)


def read_data_1():
    """Load breast-cancer data with labels remapped 0 -> -1 (AdaBoost-style
    +/-1 targets) and return a 70/30 train/test split."""
    # Lazy import: keeps the tree code free of a hard sklearn dependency.
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    data = load_breast_cancer()
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        data.data, data.target, test_size=0.3)
    Ytrain[Ytrain == 0] = -1
    Ytest[Ytest == 0] = -1
    return Xtrain, Xtest, Ytrain, Ytest


if __name__ == '__main__':
    Xtrain, Xtest, Ytrain, Ytest = read_data_1()
    print(calc_ent1(Ytrain))
    weights = np.ones(len(Ytrain)) / Ytrain.shape[0]
    print("熵值", calc_ent(Ytrain))
    print("熵值", calc_ent(Ytrain))
    print("基尼指数", cal_gini(Ytrain))
    print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain))
    clf = MyDT(criterion="C4.5", max_depth=1)
    clf.fit(Xtrain, Ytrain, weights)
    print(clf.predict(Xtest))
    print("测试集", clf.score(Xtest, Ytest))
    print("训练集", clf.score(Xtrain, Ytrain))
    MyDT.export(clf, feature_name, class_names)