my_tree.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
import numpy as np

# Display names for the 13 wine-dataset features (Chinese labels).
# NOTE(review): the trailing 'A'..'T' entries look like padding placeholders
# (perhaps for re-split feature columns) — confirm intent.
feature_name = ['酒精', '苹果酸', '灰', '灰的碱性', '镁', '总酚', '类黄酮',
                '非黄烷类酚类', '花青素', '颜色强度', '色调', 'od280/od315稀释葡萄酒', '脯氨酸',
                'A', 'B', 'c', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T']
# Display names of the three wine classes (Chinese labels).
class_names = ["琴酒", "雪莉", "贝尔摩德"]
  10. # 生成决策树的节点类型
  11. class TreeNode(object):
  12. idx = 0 # 属性
  13. idx_value = 0.0 # 属性值
  14. ent = 0.0 # 信息增益
  15. is_leaf = False
  16. y = 0 # 预测值
  17. samples = 0 # 样本数
  18. value = [] # 分布情况
  19. left = None
  20. right = None
  21. def __init__(self, idx, idx_value, ent, is_leaf, y, samples, value, left=None, right=None):
  22. self.idx = idx
  23. self.idx_value = idx_value
  24. self.is_leaf = is_leaf
  25. self.ent = ent
  26. self.y = y
  27. self.samples = samples
  28. self.value = value
  29. self.left = left
  30. self.right = right
  31. if self.y is None:
  32. self.y = np.where(value == np.max(value))[0][0] ## TODO
  33. # print(self.y, self.value)
  34. def __str__(self):
  35. if self.idx == -1:
  36. return "%s,%f, ent:%f, leaf:%d, samples:%d, value:%s, %s" % \
  37. ('', self.idx_value, self.ent, self.is_leaf,self.samples, self.value, class_names[self.y])
  38. else:
  39. return "%s,%f, ent:%f, leaf:%d, samples:%d, value:%s, %s" % \
  40. (feature_name[self.idx], self.idx_value, self.ent, self.is_leaf, self.samples, self.value, class_names[self.y])
  41. def export(self, _feature_name, _class_names):
  42. if self.idx == -1:
  43. return "%s,%f, ent:%f, leaf:%d, samples:%d, value:%s, %s" % \
  44. ('', self.idx_value, self.ent, self.is_leaf,self.samples, self.value, _class_names[self.y])
  45. else:
  46. return "%s,%f, ent:%f, leaf:%d, samples:%d, value:%s, %s" % \
  47. (_feature_name[self.idx], self.idx_value, self.ent, self.is_leaf, self.samples, self.value, _class_names[self.y])
  48. def read_data():
  49. wine = load_wine()
  50. print("数组结构", wine.data.shape) # 178*13
  51. print(wine.target)
  52. print(wine.feature_names)
  53. print(wine.target_names)
  54. # 如果wine是一张表,应该长这样:
  55. import pandas as pd
  56. # pdata = pd.concat([pd.DataFrame(wine.data), pd.DataFrame(wine.target)], axis=1)
  57. Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,wine.target,test_size=0.3)
  58. return Xtrain, Xtest, Ytrain, Ytest
  59. def calc_ent(x, weights=None):
  60. """
  61. calculate shanno ent of x
  62. """
  63. x_value_list = set([x[i] for i in range(x.shape[0])])
  64. ent = 0.0
  65. if weights is not None:
  66. weights = weights / sum(weights)
  67. for x_value in x_value_list:
  68. if weights is None:
  69. p = float(x[x == x_value].shape[0]) / x.shape[0]
  70. else:
  71. p = sum(sum([x == x_value]*weights))
  72. logp = np.log2(p)
  73. ent -= p * logp
  74. return ent
  75. def cal_gini(x):
  76. x_value_list = set([x[i] for i in range(x.shape[0])])
  77. ent = 0.0
  78. for x_value in x_value_list:
  79. p = float(x[x == x_value].shape[0]) / x.shape[0]
  80. ent += p*p
  81. return 1-ent
  82. def calc_ent1(x):
  83. """
  84. calculate shanno ent of x
  85. """
  86. p_set = []
  87. for item in x:
  88. for i in p_set:
  89. if i[0] == item:
  90. i[1] = i[1] + 1
  91. break
  92. else:
  93. i = [item, 1]
  94. p_set.append(i)
  95. pro_list = []
  96. size = len(x)
  97. for item in p_set:
  98. pro_list.append(item[1]/size)
  99. ent = 0.0
  100. for p in pro_list:
  101. logp = np.log2(p)
  102. ent -= p * logp
  103. return ent
  104. # 计算某个属性的信息增益
  105. def cal_ent_attr(Xtrain, Ytrain, weights):
  106. # print('sharp', Xtrain.shape)
  107. weights = weights / sum(weights)
  108. # 对每个属性
  109. min_ent = 100
  110. min_i = 0
  111. min_mean = 0
  112. for i in range(Xtrain.shape[1]):
  113. x_value_list = set([Xtrain[j][i] for j in range(Xtrain.shape[0])])
  114. mean = sum(x_value_list)/len(x_value_list)
  115. sum_ent = 0
  116. # 二叉树
  117. p = Ytrain[Xtrain[:, i] > mean]
  118. p0 = sum(weights[Xtrain[:, i] > mean])
  119. sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] > mean])*p0
  120. p = Ytrain[Xtrain[:, i] <= mean]
  121. sum_ent = sum_ent + calc_ent(p, weights[Xtrain[:, i] <= mean])*(1-p0)
  122. if sum_ent <= min_ent:
  123. min_ent = sum_ent
  124. min_i = i
  125. min_mean = mean
  126. return min_i, min_mean, min_ent
  127. def cal_max_ent_attr_c45(Xtrain, Ytrain, weights=None):
  128. max_ent = 0
  129. max_mean = 0
  130. weights = weights / sum(weights)
  131. h = calc_ent(Ytrain, weights)
  132. p = 0
  133. for k in range(0, len(Xtrain) - 1, 3):
  134. left = Xtrain[:k + 1]
  135. right = Xtrain[k + 1:]
  136. if weights is None:
  137. left_ent = calc_ent(Ytrain[:k+1])*len(left)/len(Ytrain)
  138. right_ent = calc_ent(Ytrain[k + 1:])*len(right)/len(Ytrain)
  139. iv = -len(left) / len(Ytrain) * np.log2(len(left) / len(Ytrain))
  140. iv -= len(right) / len(Ytrain) * np.log2(len(right) / len(Ytrain))
  141. else:
  142. p += weights[k]
  143. left_ent = calc_ent(Ytrain[:k + 1], weights[:k+1]) * p
  144. right_ent = calc_ent(Ytrain[k + 1:], weights[k+1:]) * (1-p)
  145. iv = -p * np.log2(p)
  146. iv -= (1-p) * np.log2(1-p)
  147. gain_ent = (h - left_ent - right_ent)/iv
  148. if gain_ent > max_ent:
  149. max_ent = gain_ent
  150. max_mean = left[-1]
  151. return max_ent, max_mean
# Sample weights (module-level placeholder).
# NOTE(review): this global is never read — __main__ rebinds ``weights``
# before use and every tree function takes weights as a parameter.
weights = []
  154. # 计算某个属性的信息增益率
  155. def cal_ent_attr_c45(Xtrain, Ytrain, weights):
  156. # 对每个属性
  157. max_ent = 0
  158. max_i = 0
  159. max_mean = 0
  160. weights = weights / sum(weights)
  161. for i in range(Xtrain.shape[1]): #每个属性
  162. argsort = Xtrain[:,i].argsort()
  163. x,y,w = Xtrain[:,i][argsort], Ytrain[argsort], weights[argsort]
  164. gain_ent, mean = cal_max_ent_attr_c45(x, y, w)
  165. if gain_ent > max_ent:
  166. max_ent = gain_ent
  167. max_i = i
  168. max_mean = mean
  169. return max_i, max_mean, max_ent
  170. # 计算某个属性的基尼指数
  171. def cal_gini_attr(Xtrain, Ytrain):
  172. # print('sharp', Xtrain.shape)
  173. # 对每个属性
  174. min_ent = 100
  175. min_i = 0
  176. min_mean = 0
  177. for i in range(Xtrain.shape[1]):
  178. x_value_list = set([Xtrain[j][i] for j in range(Xtrain.shape[0])])
  179. mean = sum(x_value_list)/len(x_value_list)
  180. sum_ent = 0
  181. # 二叉树
  182. p = Ytrain[Xtrain[:, i] > mean]
  183. sum_ent = sum_ent + cal_gini(p)*len(p)/len(Ytrain)
  184. p = Ytrain[Xtrain[:, i] <= mean]
  185. sum_ent = sum_ent + cal_gini(p)*len(p)/len(Ytrain)
  186. if sum_ent < min_ent:
  187. min_ent = sum_ent
  188. min_i = i
  189. min_mean = mean
  190. return min_i, min_mean, min_ent
MAX_T = 1  # NOTE(review): never read in this file — confirm whether it is a leftover constant
  192. def is_end(Ytrain):
  193. if len(Ytrain) == 0:
  194. return True
  195. if len(set(Ytrain)) == 1: # 只有一个分类
  196. return True
  197. # 强行划分为叶子节点
  198. def leaf_node(Ytrain, weights):
  199. p_set = []
  200. k = 0
  201. for item in Ytrain:
  202. for i in p_set:
  203. if i[0] == item:
  204. i[1] = i[1] + weights[k]
  205. break
  206. else:
  207. i = [item, weights[k]]
  208. p_set.append(i)
  209. k = k + 1
  210. max_item = [0, 0]
  211. for item in p_set:
  212. if item[1] > max_item[1]:
  213. max_item = item
  214. # print('这个是叶子节点,value:', max_item[0])
  215. return TreeNode(-1, 0, 0, True, max_item[0], len(Ytrain), distrib(Ytrain))
  216. def distrib(Ytrain):
  217. x_value_list = set([Ytrain[i] for i in range(Ytrain.shape[0])])
  218. ent = 0.0
  219. d_list = np.zeros(3, dtype=int)
  220. for x_value in x_value_list:
  221. d_list[x_value] = len([1 for i in Ytrain == x_value if i])
  222. return d_list
  223. def print_width(nodes, depth, _feature_name, _class_names):
  224. if len(nodes) == 0:
  225. return
  226. print("--第", depth, "层--")
  227. node_down = []
  228. for node in nodes:
  229. print(node.export(_feature_name, _class_names))
  230. if node.left is not None:
  231. node_down.append(node.left)
  232. if node.right is not None:
  233. node_down.append(node.right)
  234. print_width(node_down, depth+1, _feature_name, _class_names)
  235. def predit_one(X, Y, node):
  236. if node.is_leaf:
  237. # print(class_names[node.y], class_names[Y])
  238. # if node.y == 0:
  239. # return -1
  240. return node.y
  241. else:
  242. if X[node.idx] <= node.idx_value:
  243. return predit_one(X,Y,node.left)
  244. else:
  245. return predit_one(X, Y, node.right)
  246. class MyDT(object):
  247. criterion = None
  248. max_depth = None
  249. root_node = None
  250. def __init__(self, criterion, max_depth, max_features=1):
  251. self.criterion = criterion
  252. self.max_depth = max_depth
  253. def fit(self, Xtrain, Ytrain, sample_weight=None):
  254. if sample_weight is None:
  255. sample_weight = np.ones(Ytrain.shape[0]) / Ytrain.shape[0]
  256. self.root_node = self.do_fit(Xtrain, Ytrain, 0, sample_weight)
  257. def do_fit(self, Xtrain, Ytrain, depth, weights):
  258. if is_end(Ytrain):
  259. # print('这个是叶子节点')
  260. return leaf_node(Ytrain, weights)
  261. if depth >= self.max_depth:
  262. return leaf_node(Ytrain, weights)
  263. if self.criterion == 'entropy':
  264. i, mean, min_ent = cal_ent_attr(Xtrain, Ytrain, weights)
  265. elif self.criterion == 'C4.5':
  266. i, mean, min_ent = cal_ent_attr_c45(Xtrain, Ytrain, weights)
  267. else:
  268. i, mean, min_ent = cal_gini_attr(Xtrain, Ytrain, weights)
  269. total_ent = 0 # calc_ent(Ytrain)
  270. # print("第", i, "个属性,mean:", mean)
  271. # 生成节点
  272. parent_node = TreeNode(i, mean, total_ent - min_ent, False, None, len(Ytrain), distrib(Ytrain))
  273. # 切分数据
  274. right_position = Xtrain[:, i] > mean
  275. right_Ytrain = Ytrain[right_position]
  276. right_Xtrain = Xtrain[right_position]
  277. # right_Xtrain = np.delete(right_Xtrain, i, axis=1) # 这个属性还可以再被切分
  278. right_node = self.do_fit(right_Xtrain, right_Ytrain, depth + 1, weights[right_position])
  279. left_position = Xtrain[:, i] <= mean
  280. left_Ytrain = Ytrain[left_position]
  281. left_Xtrain = Xtrain[left_position]
  282. # left_Xtrain = np.delete(left_Xtrain, i, axis=1)
  283. left_node = self.do_fit(left_Xtrain, left_Ytrain, depth + 1, weights[left_position])
  284. parent_node.left = left_node
  285. parent_node.right = right_node
  286. return parent_node
  287. def predict(self, Xtest):
  288. result = []
  289. for i in range(Xtest.shape[0]):
  290. result.append(predit_one(Xtest[i], None, self.root_node))
  291. return np.array(result)
  292. def score(self, Xtest, Ytest):
  293. result = self.predict(Xtest)
  294. return sum(result == Ytest)/Ytest.shape[0]
  295. @staticmethod
  296. def export(tree, feature_names, class_names):
  297. nodes = tree.root_node
  298. print_width([nodes], 1, feature_names, class_names)
if __name__ == '__main__':
    # Smoke-test the entropy/gini helpers, then train a depth-1 stump.
    Xtrain, Xtest, Ytrain, Ytest = read_data()
    print(calc_ent1(Ytrain))
    # Uniform sample weights over the training set.
    weights = np.ones(len(Ytrain))/Ytrain.shape[0]
    print("熵值", calc_ent(Ytrain))
    print("熵值", calc_ent(Ytrain, weights))
    print("基尼指数", cal_gini(Ytrain))
    print("信息增益率", cal_ent_attr_c45(Xtrain, Ytrain, weights))
    clf = MyDT(criterion="entropy", max_depth=1,)
    clf.fit(Xtrain, Ytrain, weights)
    # print_width([node], 1)
    print(clf.predict(Xtest))
    print(clf.score(Xtest, Ytest))   # test accuracy
    print(clf.score(Xtrain, Ytrain)) # train accuracy
    MyDT.export(clf, feature_name, class_names)