#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''
Simple classifier training demos: logistic regression, linear SVM,
random forest, and gradient boosting on per-index feature logs.
'''
import sys
import os
sys.path.append(os.path.abspath('..'))
from util.config import config
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
import joblib

def curce_data(x, y, y_pred):
    # Flatten (x, true y, predicted y) triples into "x,y,y_pred" CSV rows;
    # y and y_pred are assumed to be column vectors, hence s[1][0] / s[2][0].
    x = x.tolist()
    y = y.tolist()
    y_pred = y_pred.tolist()
    results = zip(x, y, y_pred)
    results = ["{},{},{}".format(s[0], s[1][0], s[2][0]) for s in results]
    return results

def read_data(path):
    with open(path) as f:
        lines = f.readlines()
    # Each line is a Python literal of the form (features, extra, label);
    # the middle element is discarded.
    lines = [eval(line.strip()) for line in lines]
    X, z, y = zip(*lines)
    X = np.array(X)
    y = np.array(y)
    return X, y

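# A safer variant of read_data, offered as a sketch and not called elsewhere
# in this file (read_data_literal is a hypothetical name): if each line is a
# plain Python literal such as "([0.1, 0.2], 'extra', 1)", ast.literal_eval
# parses it without executing arbitrary code the way eval() can.
def read_data_literal(path):
    import ast
    with open(path) as f:
        lines = [ast.literal_eval(line.strip()) for line in f]
    X, z, y = zip(*lines)
    return np.array(X), np.array(y)
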
def demo(file, model_file):
    X_train, y_train = read_data(file)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X_train, y_train, test_size=0.3)
    # Oversample the minority class on the training split only.
    ros = RandomOverSampler(random_state=22)
    X_rsampled, y_resampled = ros.fit_resample(Xtrain, Ytrain)
    # A logistic-regression object; its weights w and bias b start out
    # randomly initialized.
    model = LogisticRegression(max_iter=1200)
    # Calling fit() keeps adjusting w and b until the loss is minimized.
    model.fit(X_rsampled, y_resampled)
    # print(model.coef_)     # the weights w
    print(model.intercept_)  # the bias b
    # The model is trained; optionally check performance on the training set:
    # y_pred_train = model.predict(Xtrain)
    # # Ytrain are the true labels, y_pred_train the model's predictions;
    # # mean_squared_error computes sum((y - y_pred)^2) / n.
    # train_mse = metrics.mean_squared_error(Ytrain, y_pred_train)
    # print("train accuracy:", accuracy_score(y_pred_train, Ytrain))
    #
    # Performance on the test set:
    y_pred_test = model.predict(Xtest)
    print("test accuracy:", accuracy_score(y_pred_test, Ytest))
    # Save the model.
    joblib.dump(model, model_file)

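# An alternative to RandomOverSampler, shown as a sketch: scikit-learn's
# class_weight='balanced' reweights the loss by inverse class frequency,
# avoiding duplicated minority rows. demo_weighted is a hypothetical variant,
# not called elsewhere in this file.
def demo_weighted(file, model_file):
    X, y = read_data(file)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3)
    model = LogisticRegression(max_iter=1200, class_weight='balanced')
    model.fit(Xtrain, Ytrain)
    print("test accuracy:", accuracy_score(Ytest, model.predict(Xtest)))
    joblib.dump(model, model_file)
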
def demo_1(file, model_file):
    X_train, y_train = read_data(file)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X_train, y_train, test_size=0.25)
    # Random oversampling, applied to the training split only so that test
    # rows cannot leak into training.
    ros = RandomOverSampler(random_state=22)
    X_rsampled, y_resampled = ros.fit_resample(Xtrain, Ytrain)
    model = SVC(kernel='linear')
    # Calling fit() keeps adjusting w and b until the loss is minimized.
    model.fit(X_rsampled, y_resampled)
    # print(model.coef_)     # the weights w (available for the linear kernel)
    print(model.intercept_)  # the bias b
    # The model is trained; optionally check performance on the training set:
    # y_pred_train = model.predict(Xtrain)
    # # Ytrain are the true labels, y_pred_train the model's predictions;
    # # mean_squared_error computes sum((y - y_pred)^2) / n.
    # train_mse = metrics.mean_squared_error(Ytrain, y_pred_train)
    # print("train accuracy:", accuracy_score(y_pred_train, Ytrain))
    #
    # Performance on the test set:
    y_pred_test = model.predict(Xtest)
    print("test accuracy:", accuracy_score(y_pred_test, Ytest))
    # Save the model.
    joblib.dump(model, model_file)

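# Accuracy alone can be misleading on imbalanced labels. A sketch of ROC AUC
# for a binary task (evaluate_auc is a hypothetical helper, not called below);
# SVC's decision_function returns the signed margin that roc_auc_score expects.
def evaluate_auc(model, Xtest, Ytest):
    scores = model.decision_function(Xtest)  # signed distance to the hyperplane
    return metrics.roc_auc_score(Ytest, scores)
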
def demo_2(file, model_file):
    X_train, y_train = read_data(file)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X_train, y_train, test_size=0.25)
    rfc = RandomForestClassifier(random_state=0, n_estimators=20, max_depth=4)
    rfc = rfc.fit(Xtrain, Ytrain)
    print(rfc.score(Xtest, Ytest))
    # Performance on the test set:
    y_pred_test = rfc.predict(Xtest)
    print("test accuracy:", accuracy_score(y_pred_test, Ytest))
    # Save the model.
    joblib.dump(rfc, model_file)

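# A single train/test split gives a noisy accuracy estimate on small files.
# A sketch of 5-fold cross-validation for the same forest (cross_val_score is
# standard scikit-learn; demo_2_cv is a hypothetical variant, not called below):
def demo_2_cv(file):
    from sklearn.model_selection import cross_val_score
    X, y = read_data(file)
    rfc = RandomForestClassifier(random_state=0, n_estimators=20, max_depth=4)
    scores = cross_val_score(rfc, X, y, cv=5)
    print("5-fold accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))
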
def demo_4(file, model_file):
    X_train, y_train = read_data(file)
    # Xtrain, Xtest, Ytrain, Ytest = train_test_split(X_train, y_train, test_size=0.3)
    # Grid-search tree depth and split size for the gradient-boosting model.
    param_test1 = {'max_depth': range(3, 13, 2), 'min_samples_split': range(80, 800, 50)}
    gbm2 = GradientBoostingClassifier(n_estimators=130, learning_rate=0.7,
                                      max_features='sqrt', random_state=10)
    gseach1 = GridSearchCV(estimator=gbm2, param_grid=param_test1, scoring="roc_auc", cv=5)
    gseach1.fit(X_train, y_train)
    print(gseach1.param_grid, gseach1.best_params_, gseach1.best_score_)
    # rfc = gbm2.fit(Xtrain, Ytrain)
    # print(gbm2.score(Xtest, Ytest))
    # # Performance on the test set:
    # y_pred_test = gbm2.predict(Xtest)
    # print("test accuracy:", accuracy_score(y_pred_test, Ytest))
    # # Save the model.
    # joblib.dump(gbm2, model_file)

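# After fixing max_depth / min_samples_split from the search above, a common
# next step is to tune learning_rate against n_estimators. A sketch only:
# demo_4b is a hypothetical follow-up, and the fixed values below are
# placeholders for whatever best_params_ the first search reports.
def demo_4b(file):
    X_train, y_train = read_data(file)
    param_test2 = {'learning_rate': [0.05, 0.1, 0.3, 0.7],
                   'n_estimators': range(20, 200, 20)}
    gbm = GradientBoostingClassifier(max_depth=5, min_samples_split=130,
                                     max_features='sqrt', random_state=10)
    gsearch2 = GridSearchCV(estimator=gbm, param_grid=param_test2,
                            scoring="roc_auc", cv=5)
    gsearch2.fit(X_train, y_train)
    print(gsearch2.best_params_, gsearch2.best_score_)
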
def demo_3(file, model_file):
    X_train, y_train = read_data(file)
    # Xtrain, Xtest, Ytrain, Ytest = train_test_split(X_train, y_train, test_size=0.3)
    # gbm2 = GradientBoostingClassifier(n_estimators=120, max_depth=4, min_samples_split=180,
    #                                   learning_rate=0.7, max_features='sqrt', random_state=101)
    gbm2 = GradientBoostingClassifier(n_estimators=15, max_depth=4, min_samples_split=140,
                                      learning_rate=0.7, max_features='sqrt', random_state=101)
    # gbm2 = GradientBoostingClassifier(n_estimators=30, max_depth=4, min_samples_split=200,
    #                                   learning_rate=0.7, max_features='sqrt', random_state=101)
    gbm2.fit(X_train, y_train)
    print(gbm2.feature_importances_)
    # print(gbm2.score(Xtest, Ytest))
    # Performance on the test set:
    # y_pred_test = gbm2.predict(Xtest)
    # print("test accuracy:", accuracy_score(y_pred_test, Ytest))
    # Save the model.
    joblib.dump(gbm2, model_file)

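# Loading a saved model back for inference, as a sketch (load_and_predict is
# a hypothetical helper; joblib.load is the standard counterpart of dump):
def load_and_predict(model_file, data_file):
    model = joblib.load(model_file)
    X, y = read_data(data_file)
    y_pred = model.predict(X)
    print("accuracy on this file:", accuracy_score(y, y_pred))
    return y_pred
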
if __name__ == '__main__':
    root_dir = 'D:\\data\\quantization\\jqxx2\\'
    model_dir = 'D:\\data\\quantization\\jqxx2_tree_model\\'
    m = '399325.SZ.log' #12
    demo_3(root_dir + m, model_dir + m[:6] + '.pkl')
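    # To train one model per index file, a loop over the data directory could
    # replace the single call above. A sketch, assuming every file in root_dir
    # follows the same "<code>.log" naming:
    # for m in os.listdir(root_dir):
    #     if m.endswith('.log'):
    #         demo_3(root_dir + m, model_dir + m[:6] + '.pkl')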