my_ada_boost.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. # -*- encoding:utf-8 -*-
  2. from sklearn.datasets import load_wine, load_iris, load_breast_cancer
  3. from sklearn.model_selection import train_test_split
  4. from sklearn.tree import DecisionTreeClassifier
  5. import numpy as np
  6. from tree.my_tree import MyDT
  7. def read_data():
  8. wine = load_breast_cancer()
  9. Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,wine.target,test_size=0.3)
  10. for i in range(len(Ytrain)):
  11. if Ytrain[i] == 0:
  12. Ytrain[i] = -1
  13. for i in range(len(Ytest)):
  14. if Ytest[i] == 0:
  15. Ytest[i] = -1
  16. return Xtrain, Xtest, Ytrain, Ytest
  17. def fit(Xtrain, Ytrain):
  18. # 设置数据初始权重
  19. w = np.ones(len(Ytrain))/Ytrain.shape[0]
  20. # 决策树权重
  21. alphas = []
  22. # 决策树数组
  23. trees = []
  24. pn = 1/len(Ytrain)
  25. # print(Ytest)
  26. for i in range(30):
  27. # 训练决策树
  28. clf = MyDT(criterion="gini", max_features=1, max_depth=1,
  29. ) # 实例化,criterion不写的话默认是基尼系数
  30. clf.fit(Xtrain, Ytrain, w)
  31. Ypredit = clf.predict(Xtrain)
  32. error = np.dot(Ypredit != Ytrain, w)
  33. p_error = sum(Ypredit != Ytrain)/Ytrain.shape[0]
  34. if error > 0.5:
  35. continue
  36. if error == 0:
  37. error = 0.001
  38. print("第", i, "轮错误率", p_error, error)
  39. alpha = 0.5*np.log2(1/error - 1)
  40. # 更新权重
  41. for j in range(Ytrain.shape[0]):
  42. w[j] = w[j]*np.exp(-alpha*Ytrain[j]*Ypredit[j])
  43. sum_w = sum(w)
  44. w = w/sum_w
  45. alphas.append(alpha)
  46. trees.append(clf)
  47. return trees, alphas
  48. def cmp_predict(trees, alphas, Xtrain, Ytrain):
  49. predicts = []
  50. for tree in trees:
  51. predicts.append(tree.predict(Xtrain))
  52. # 结果加权
  53. result = np.zeros(len(Xtrain), float)
  54. for p in predicts:
  55. r = 0
  56. for w_alpha in alphas:
  57. r += w_alpha * p
  58. result = result + r
  59. # print("sign前:" , result)
  60. result = np.sign(result)
  61. # print("sign后:", result)
  62. # print(1- sum(np.bitwise_xor(Ytest, result))/len(result))
  63. # print(result == Ytest)
  64. x = len([i for i in result == Ytrain if i])/len(result)
  65. x = x + 0.5*len([i for i in result if i==0])/len(result)
  66. print(x)
  67. w = np.array([1/len(Xtrain) for i in range(len(Xtrain))])
  68. Ypredit = trees[0].predict(Xtrain)
  69. p_error = sum(Ypredit != Ytrain)/Ytrain.shape[0]
  70. print(1-p_error)
  71. if __name__ == '__main__':
  72. Xtrain, Xtest, Ytrain, Ytest = read_data()
  73. trees, alphas = fit(Xtrain, Ytrain)
  74. print(alphas)
  75. print('训练集上比较')
  76. cmp_predict(trees, alphas, Xtrain, Ytrain)
  77. print('测试集上比较')
  78. cmp_predict(trees, alphas, Xtest, Ytest)