# my_tree.py — hand-rolled decision-tree classifier (file-listing artifacts removed)
  1. #!/usr/bin/python
  2. # -*- coding: UTF-8 -*-
  3. from sklearn.datasets import load_wine,load_breast_cancer
  4. from sklearn.model_selection import train_test_split
  5. import numpy as np
# Display names for the wine dataset's 13 features, padded with placeholder
# names 'A'..'T' so the list is long enough for wider matrices
# (e.g. the 30-column breast-cancer data used in read_data_1).
feature_name = ['酒精', '苹果酸', '灰', '灰的碱性', '镁', '总酚', '类黄酮',
                '非黄烷类酚类', '花青素', '颜色强度', '色调', 'od280/od315稀释葡萄酒', '脯氨酸',
                'A', 'B', 'c', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T']
# Display names for the three wine cultivar classes.
class_names = ["琴酒", "雪莉", "贝尔摩德"]
  10. # 生成决策树的节点类型
  11. class TreeNode(object):
  12. idx = 0 # 属性
  13. idx_value = 0.0 # 属性值
  14. ent = 0.0 # 信息增益
  15. is_leaf = False
  16. y = 0 # 预测值
  17. samples = 0 # 样本数
  18. value = [] # 分布情况
  19. left = None
  20. right = None
  21. def __init__(self, idx, idx_value, ent, is_leaf, y, samples, value, left=None, right=None):
  22. self.idx = idx
  23. self.idx_value = idx_value
  24. self.is_leaf = is_leaf
  25. self.ent = ent
  26. self.y = y
  27. self.samples = samples
  28. self.value = value
  29. self.left = left
  30. self.right = right
  31. if self.y is None:
  32. self.y = np.where(value == np.max(value))[0][0] ## TODO
  33. # print(self.y, self.value)
  34. def __str__(self):
  35. if self.idx == -1:
  36. return "%s,%f, ent:%f, leaf:%d, samples:%d, value:%s, %s" % \
  37. ('', self.idx_value, self.ent, self.is_leaf,self.samples, self.value, class_names[self.y])
  38. else:
  39. return "%s,%f, ent:%f, leaf:%d, samples:%d, value:%s, %s" % \
  40. (feature_name[self.idx], self.idx_value, self.ent, self.is_leaf, self.samples, self.value, class_names[self.y])
  41. def export(self, _feature_name, _class_names):
  42. if self.idx == -1:
  43. return "%s,%f, ent:%f, leaf:%d, samples:%d, value:%s, %s" % \
  44. ('', self.idx_value, self.ent, self.is_leaf,self.samples, self.value, _class_names[self.y])
  45. else:
  46. return "%s,%f, ent:%f, leaf:%d, samples:%d, value:%s, %s" % \
  47. (_feature_name[self.idx], self.idx_value, self.ent, self.is_leaf, self.samples, self.value, _class_names[self.y])
  48. def read_data():
  49. wine = load_wine()
  50. print("数组结构", wine.data.shape) # 178*13
  51. print(wine.target)
  52. print(wine.feature_names)
  53. print(wine.target_names)
  54. # 如果wine是一张表,应该长这样:
  55. import pandas as pd
  56. # pdata = pd.concat([pd.DataFrame(wine.data), pd.DataFrame(wine.target)], axis=1)
  57. Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,wine.target,test_size=0.3)
  58. return Xtrain, Xtest, Ytrain, Ytest
  59. def calc_ent(x, weights=None):
  60. """
  61. calculate shanno ent of x
  62. """
  63. x_value_list = set([x[i] for i in range(x.shape[0])])
  64. ent = 0.0
  65. if weights is not None:
  66. weights = weights / sum(weights)
  67. for x_value in x_value_list:
  68. if weights is None:
  69. p = float(x[x == x_value].shape[0]) / x.shape[0]
  70. else:
  71. p = sum(sum([x == x_value]*weights))
  72. logp = np.log2(p)
  73. ent -= p * logp
  74. return ent
  75. def cal_gini(x):
  76. x_value_list = set([x[i] for i in range(x.shape[0])])
  77. ent = 0.0
  78. for x_value in x_value_list:
  79. p = float(x[x == x_value].shape[0]) / x.shape[0]
  80. ent += p*p
  81. return 1-ent
  82. def calc_ent1(x):
  83. """
  84. calculate shanno ent of x
  85. """
  86. p_set = []
  87. for item in x:
  88. for i in p_set:
  89. if i[0] == item:
  90. i[1] = i[1] + 1
  91. break
  92. else:
  93. i = [item, 1]
  94. p_set.append(i)
  95. pro_list = []
  96. size = len(x)
  97. for item in p_set:
  98. pro_list.append(item[1]/size)
  99. ent = 0.0
  100. for p in pro_list:
  101. logp = np.log2(p)
  102. ent -= p * logp
  103. return ent
  104. # 计算某个属性的信息增益
  105. def cal_ent_attr(Xtrain, Ytrain):
  106. # print('sharp', Xtrain.shape)
  107. # 对每个属性
  108. min_ent = 100
  109. min_i = 0
  110. min_mean = 0
  111. for i in range(Xtrain.shape[1]):
  112. x_value_list = set([Xtrain[j][i] for j in range(Xtrain.shape[0])])
  113. mean = sum(x_value_list)/len(x_value_list)
  114. sum_ent = 0
  115. # 二叉树
  116. p = Ytrain[Xtrain[:, i] > mean]
  117. p0 = len(p)/Ytrain.shape[0]
  118. sum_ent = sum_ent + calc_ent(p)*p0
  119. p = Ytrain[Xtrain[:, i] <= mean]
  120. sum_ent = sum_ent + calc_ent(p)*(1-p0)
  121. if sum_ent <= min_ent:
  122. min_ent = sum_ent
  123. min_i = i
  124. min_mean = mean
  125. return min_i, min_mean, min_ent
  126. def cal_max_ent_attr_c45(Xtrain, Ytrain):
  127. max_ent = 0
  128. max_mean = 0
  129. h = calc_ent(Ytrain)
  130. p = 0
  131. for k in range(0, len(Xtrain) - 1, 3):
  132. left = Xtrain[:k + 1]
  133. right = Xtrain[k + 1:]
  134. left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
  135. right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
  136. iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
  137. iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
  138. gain_ent = (h - left_ent - right_ent)/iv
  139. if gain_ent > max_ent:
  140. max_ent = gain_ent
  141. max_mean = left[-1]
  142. return max_ent, max_mean
  143. # 计算某个属性的信息增益率
  144. def cal_ent_attr_c45(Xtrain, Ytrain):
  145. # 对每个属性
  146. max_ent = 0
  147. max_i = 0
  148. max_mean = 0
  149. for i in range(Xtrain.shape[1]): #每个属性
  150. argsort = Xtrain[:,i].argsort()
  151. x,y = Xtrain[:,i][argsort], Ytrain[argsort]
  152. gain_ent, mean = cal_max_ent_attr_c45(x, y)
  153. if gain_ent > max_ent:
  154. max_ent = gain_ent
  155. max_i = i
  156. max_mean = mean
  157. return max_i, max_mean, max_ent
  158. # 计算某个属性的基尼指数
  159. def cal_gini_attr(Xtrain, Ytrain):
  160. # print('sharp', Xtrain.shape)
  161. # 对每个属性
  162. min_ent = 100
  163. min_i = 0
  164. min_mean = 0
  165. for i in range(Xtrain.shape[1]):
  166. x_value_list = set([Xtrain[j][i] for j in range(Xtrain.shape[0])])
  167. mean = sum(x_value_list)/len(x_value_list)
  168. sum_ent = 0
  169. # 二叉树
  170. p = Ytrain[Xtrain[:, i] > mean]
  171. sum_ent = sum_ent + cal_gini(p)*len(p)/len(Ytrain)
  172. p = Ytrain[Xtrain[:, i] <= mean]
  173. sum_ent = sum_ent + cal_gini(p)*len(p)/len(Ytrain)
  174. if sum_ent < min_ent:
  175. min_ent = sum_ent
  176. min_i = i
  177. min_mean = mean
  178. return min_i, min_mean, min_ent
  179. def is_end(Ytrain):
  180. if len(Ytrain) == 0:
  181. return True
  182. if len(set(Ytrain)) == 1: # 只有一个分类
  183. return True
  184. # 强行划分为叶子节点
  185. def leaf_node(Ytrain):
  186. p_set = []
  187. k = 0
  188. for item in Ytrain:
  189. for i in p_set:
  190. if i[0] == item:
  191. i[1] = i[1] + 1
  192. break
  193. else:
  194. i = [item, 1]
  195. p_set.append(i)
  196. k = k + 1
  197. max_item = [0, 0]
  198. for item in p_set:
  199. if item[1] > max_item[1]:
  200. max_item = item
  201. # print('这个是叶子节点,value:', max_item[0])
  202. return TreeNode(-1, 0, 0, True, max_item[0], len(Ytrain), distrib(Ytrain))
  203. def distrib(Ytrain):
  204. x_value_list = set([Ytrain[i] for i in range(Ytrain.shape[0])])
  205. ent = 0.0
  206. d_list = np.zeros(3, dtype=int)
  207. for x_value in x_value_list:
  208. d_list[x_value] = len([1 for i in Ytrain == x_value if i])
  209. return d_list
  210. def print_width(nodes, depth, _feature_name, _class_names):
  211. if len(nodes) == 0:
  212. return
  213. print("--第", depth, "层--")
  214. node_down = []
  215. for node in nodes:
  216. print(node.export(_feature_name, _class_names))
  217. if node.left is not None:
  218. node_down.append(node.left)
  219. if node.right is not None:
  220. node_down.append(node.right)
  221. print_width(node_down, depth+1, _feature_name, _class_names)
  222. def predit_one(X, Y, node):
  223. if node.is_leaf:
  224. # print(class_names[node.y], class_names[Y])
  225. # if node.y == 0:
  226. # return -1
  227. return node.y
  228. else:
  229. if X[node.idx] <= node.idx_value:
  230. return predit_one(X,Y,node.left)
  231. else:
  232. return predit_one(X, Y, node.right)
  233. class MyDT(object):
  234. criterion = None
  235. max_depth = None
  236. root_node = None
  237. def __init__(self, criterion, max_depth, max_features=1):
  238. self.criterion = criterion
  239. self.max_depth = max_depth
  240. def fit(self, Xtrain, Ytrain, sample_weight=None):
  241. if sample_weight is not None:
  242. indices = [i for i in np.random.choice(Xtrain.shape[0], Ytrain.shape[0], p=sample_weight)]
  243. Xtrain = Xtrain[indices]
  244. Ytrain = Ytrain[indices]
  245. self.root_node = self.do_fit(Xtrain, Ytrain, 0)
  246. def do_fit(self, Xtrain, Ytrain, depth):
  247. if is_end(Ytrain):
  248. # print('这个是叶子节点')
  249. return leaf_node(Ytrain)
  250. if depth >= self.max_depth:
  251. return leaf_node(Ytrain)
  252. if self.criterion == 'entropy':
  253. i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain)
  254. elif self.criterion == 'C4.5':
  255. i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain)
  256. else:
  257. i, mean, min_ent = cal_gini_attr(Xtrain, Ytrain)
  258. total_ent = 0 # calc_ent(Ytrain)
  259. # print("第", i, "个属性,mean:", mean)
  260. # 生成节点
  261. parent_node = TreeNode(i, mean, total_ent - min_ent, False, None, len(Ytrain), distrib(Ytrain))
  262. # 切分数据
  263. right_position = Xtrain[:, i] > mean
  264. right_Ytrain = Ytrain[right_position]
  265. right_Xtrain = Xtrain[right_position]
  266. # right_Xtrain = np.delete(right_Xtrain, i, axis=1) # 这个属性还可以再被切分
  267. right_node = self.do_fit(right_Xtrain, right_Ytrain, depth + 1)
  268. left_position = Xtrain[:, i] <= mean
  269. left_Ytrain = Ytrain[left_position]
  270. left_Xtrain = Xtrain[left_position]
  271. # left_Xtrain = np.delete(left_Xtrain, i, axis=1)
  272. left_node = self.do_fit(left_Xtrain, left_Ytrain, depth + 1)
  273. parent_node.left = left_node
  274. parent_node.right = right_node
  275. return parent_node
  276. def predict(self, Xtest):
  277. result = []
  278. for i in range(Xtest.shape[0]):
  279. result.append(predit_one(Xtest[i], None, self.root_node))
  280. return np.array(result)
  281. def score(self, Xtest, Ytest):
  282. result = self.predict(Xtest)
  283. return sum(result == Ytest)/Ytest.shape[0]
  284. @staticmethod
  285. def export(tree, feature_names, class_names):
  286. nodes = tree.root_node
  287. print_width([nodes], 1, feature_names, class_names)
  288. def read_data_1():
  289. wine = load_breast_cancer()
  290. Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,wine.target,test_size=0.3)
  291. for i in range(len(Ytrain)):
  292. if Ytrain[i] == 0:
  293. Ytrain[i] = -1
  294. for i in range(len(Ytest)):
  295. if Ytest[i] == 0:
  296. Ytest[i] = -1
  297. return Xtrain, Xtest, Ytrain, Ytest
if __name__ == '__main__':
    # Demo: fit a depth-1 C4.5 tree on the breast-cancer data (labels {-1, +1})
    # using uniform sample weights, then report train/test accuracy.
    Xtrain, Xtest, Ytrain, Ytest = read_data_1()
    print(calc_ent1(Ytrain))
    # Uniform weights summing to 1 (fit() resamples according to them).
    weights = np.ones(len(Ytrain))/Ytrain.shape[0]
    print("熵值", calc_ent(Ytrain))
    # NOTE(review): duplicate of the line above — prints the same entropy twice.
    print("熵值", calc_ent(Ytrain))
    print("基尼指数", cal_gini(Ytrain))
    print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain))
    clf = MyDT(criterion="C4.5", max_depth=1,)
    clf.fit(Xtrain, Ytrain, weights)
    # print_width([node], 1)
    print(clf.predict(Xtest))
    print("测试集", clf.score(Xtest, Ytest))
    print("训练集", clf.score(Xtrain, Ytrain))
    MyDT.export(clf, feature_name, class_names)