123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132 |
- #!/usr/bin/python
- # -*- coding: UTF-8 -*-
- '''
- 最简单的mse
- '''
- import sys
- import os
- sys.path.append(os.path.abspath('..'))
- from util.config import config
- import numpy as np
- from sklearn.linear_model import LogisticRegression
- from sklearn.svm import SVC
- from sklearn import metrics
- from sklearn.model_selection import train_test_split
- from sklearn.metrics import accuracy_score
- from imblearn.over_sampling import RandomOverSampler
- import joblib
def curce_data(x, y, y_pred):
    """Render each sample as an ``"x,y,y_pred"`` text row.

    All three arguments are converted with ``.tolist()`` and paired
    position by position. Each ``x`` entry is formatted as a whole, while
    only the first element of each ``y`` and ``y_pred`` entry is used, so
    those two must hold indexable rows (presumably column vectors of shape
    ``(n, 1)`` -- confirm against callers).

    Returns:
        A list of strings, one per sample. Because the inputs are paired
        with ``zip``, the result is as long as the shortest input.
    """
    features = x.tolist()
    truths = y.tolist()
    guesses = y_pred.tolist()
    return [
        "{},{},{}".format(feature, truth[0], guess[0])
        for feature, truth, guess in zip(features, truths, guesses)
    ]
def read_data(path):
    """Load feature vectors and labels from a text file of Python literals.

    Every non-blank line must evaluate to a 3-item sequence
    ``(features, z, label)``. The middle item is read but discarded.
    Blank lines (including a trailing empty line) are skipped.

    Args:
        path: Path of the data file to read.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays built from the first and third
        item of every line, in file order.

    Raises:
        ValueError: If the file contains no data lines, or if the lines do
            not each hold exactly three items.
    """
    samples = []
    with open(path) as f:
        # Iterate the file lazily instead of materialising readlines().
        for line in f:
            text = line.strip()
            if not text:
                # eval('') raises SyntaxError; blank lines carry no sample.
                continue
            # SECURITY NOTE(review): eval() executes arbitrary code found in
            # the file. Only use this on trusted data; if every line is a
            # plain literal, ast.literal_eval would be a safer replacement.
            samples.append(eval(text))
    if not samples:
        # Fail with a clear message instead of an opaque unpacking error.
        raise ValueError("no samples found in {!r}".format(path))
    X, _, y = zip(*samples)
    return np.array(X), np.array(y)
def demo(file, model_file):
    """Fit a logistic-regression classifier on all of ``file`` and save it.

    The whole data set returned by ``read_data`` is used for fitting; no
    train/test split or evaluation is performed here.

    Args:
        file: Path of the data file, passed to ``read_data``.
        model_file: Destination passed to ``joblib.dump`` for the fitted
            model.
    """
    features, labels = read_data(file)

    # Default-configured classifier; fit() estimates its coefficients and
    # intercept from the data.
    classifier = LogisticRegression()
    classifier.fit(features, labels)

    # Show the learned intercept (bias) term.
    print(classifier.intercept_)

    # Persist the fitted model.
    joblib.dump(classifier, model_file)
def demo_1(file, model_file):
    """Train a linear-kernel SVM on an oversampled split, report accuracy, save it.

    The data from ``read_data`` is split 70/30. No ``random_state`` is
    passed to ``train_test_split``, so the split is not pinned by this
    function. Only the training part is oversampled with
    ``RandomOverSampler`` before fitting. Accuracy is then printed for the
    un-resampled training part and for the held-out part.

    Args:
        file: Path of the data file, passed to ``read_data``.
        model_file: Destination passed to ``joblib.dump`` for the fitted
            model.
    """
    X_all, y_all = read_data(file)
    X_fit, X_held, y_fit, y_held = train_test_split(X_all, y_all, test_size=0.3)

    # Oversample the training split only, so the held-out data stays
    # untouched.
    sampler = RandomOverSampler(random_state=22)
    X_balanced, y_balanced = sampler.fit_resample(X_fit, y_fit)

    model = SVC(kernel='linear')
    model.fit(X_balanced, y_balanced)

    # Show the learned intercept (bias) term.
    print(model.intercept_)

    # Accuracy on the original (not resampled) training split. The unused
    # mean-squared-error computation was dropped: its result was never read,
    # and it could fail on non-numeric class labels.
    fit_pred = model.predict(X_fit)
    print("train准确率:", accuracy_score(y_fit, fit_pred))

    # Accuracy on the held-out split.
    held_pred = model.predict(X_held)
    print("test准确率:", accuracy_score(y_held, held_pred))

    # Persist the fitted model.
    joblib.dump(model, model_file)
- from sklearn.ensemble import RandomForestClassifier
def demo_2(file, model_file):
    """Train a 10-tree random forest on a 70/30 split, report accuracy, save it.

    The held-out accuracy is printed twice: once through the estimator's
    ``score`` method and once through ``accuracy_score`` on explicit
    predictions.

    Args:
        file: Path of the data file, passed to ``read_data``.
        model_file: Destination passed to ``joblib.dump`` for the fitted
            model.
    """
    X_all, y_all = read_data(file)
    X_fit, X_held, y_fit, y_held = train_test_split(X_all, y_all, test_size=0.3)

    # Build and fit in one step; fit() returns the estimator itself.
    forest = RandomForestClassifier(random_state=0, n_estimators=10).fit(X_fit, y_fit)
    print(forest.score(X_held, y_held))

    # Evaluate on the held-out split.
    held_pred = forest.predict(X_held)
    print("test准确率:", accuracy_score(held_pred, y_held))

    # Persist the fitted model.
    joblib.dump(forest, model_file)
- from sklearn.ensemble import GradientBoostingClassifier
def demo_3(file, model_file):
    """Train a gradient-boosting classifier on a 70/30 split, report accuracy, save it.

    The held-out accuracy is printed twice: once through the estimator's
    ``score`` method and once through ``accuracy_score`` on explicit
    predictions.

    Args:
        file: Path of the data file, passed to ``read_data``.
        model_file: Destination passed to ``joblib.dump`` for the fitted
            model.
    """
    X_all, y_all = read_data(file)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X_all, y_all, test_size=0.3)

    gbm2 = GradientBoostingClassifier(
        n_estimators=55,
        max_depth=10,
        learning_rate=0.7,
        max_features='sqrt',
        random_state=10,
    )
    # fit() trains in place; the former unused ``rfc`` alias (a leftover
    # random-forest name) was removed.
    gbm2.fit(Xtrain, Ytrain)
    print(gbm2.score(Xtest, Ytest))

    # Evaluate on the held-out split.
    y_pred_test = gbm2.predict(Xtest)
    print("test准确率:", accuracy_score(y_pred_test, Ytest))

    # Persist the fitted model.
    joblib.dump(gbm2, model_file)
if __name__ == '__main__':
    # Hard-coded input and output directories (Windows paths with a
    # trailing separator so names can be appended directly).
    data_root = 'D:\\data\\quantization\\jqxx2\\'
    model_root = 'D:\\data\\quantization\\jqxx2_svm_model\\'
    log_name = '399308.SZ.log'

    # The model file is named after the first six characters of the log name.
    source_path = data_root + log_name
    target_path = model_root + str(log_name)[:6] + '.pkl'
    demo_1(source_path, target_path)
|