#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''
Minimal classifier demos: train logistic regression, linear SVM, random forest,
and gradient boosting models on stock-log data, then save each fitted model
with joblib.
'''
import numpy as np
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler


def curve_data(x, y, y_pred):
    # Format (x, actual, predicted) triples as CSV rows, e.g. for plotting.
    x = x.tolist()
    y = y.tolist()
    y_pred = y_pred.tolist()
    return ["{},{},{}".format(s[0], s[1][0], s[2][0]) for s in zip(x, y, y_pred)]


def read_data(path):
    # Each line is a Python literal tuple (X, meta, y); the middle field is unused.
    # Note: eval() is only acceptable here because the input files are trusted.
    with open(path) as f:
        lines = [eval(line.strip()) for line in f]
    X, _, y = zip(*lines)
    return np.array(X), np.array(y)


def demo(file, model_file):
    X, y = read_data(file)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3)

    # Balance the classes by randomly oversampling the training split only.
    ros = RandomOverSampler(random_state=22)
    X_resampled, y_resampled = ros.fit_resample(Xtrain, Ytrain)

    # fit() iterates until the loss converges, learning the weights w and intercept b.
    model = LogisticRegression(max_iter=1200)
    model.fit(X_resampled, y_resampled)

    # print(model.coef_)   # weights w
    print(model.intercept_)  # intercept b

    # (Optional) training-set check:
    # y_pred_train = model.predict(Xtrain)
    # print("train accuracy:", accuracy_score(Ytrain, y_pred_train))

    # Evaluate on the held-out test split.
    y_pred_test = model.predict(Xtest)
    print("test accuracy:", accuracy_score(Ytest, y_pred_test))

    # Persist the fitted model.
    joblib.dump(model, model_file)


def demo_1(file, model_file):
    X, y = read_data(file)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.25)

    # Randomly oversample the training split only; resampling the full data
    # before splitting would leak test samples into training.
    ros = RandomOverSampler(random_state=22)
    X_resampled, y_resampled = ros.fit_resample(Xtrain, Ytrain)

    model = SVC(kernel='linear')
    model.fit(X_resampled, y_resampled)

    # print(model.coef_)   # weights w (available for a linear kernel)
    print(model.intercept_)  # intercept b

    # Evaluate on the held-out test split.
    y_pred_test = model.predict(Xtest)
    print("test accuracy:", accuracy_score(Ytest, y_pred_test))

    # Persist the fitted model.
    joblib.dump(model, model_file)


def demo_2(file, model_file):
    X, y = read_data(file)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.25)

    rfc = RandomForestClassifier(random_state=0, n_estimators=20, max_depth=4)
    rfc.fit(Xtrain, Ytrain)
    print(rfc.score(Xtest, Ytest))

    # Evaluate on the held-out test split.
    y_pred_test = rfc.predict(Xtest)
    print("test accuracy:", accuracy_score(Ytest, y_pred_test))

    # Persist the fitted model.
    joblib.dump(rfc, model_file)


def demo_3(file, model_file):
    X, y = read_data(file)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3)

    gbm = GradientBoostingClassifier(n_estimators=320, max_depth=6,
                                     learning_rate=0.7, max_features='sqrt',
                                     random_state=10)
    gbm.fit(Xtrain, Ytrain)
    print(gbm.score(Xtest, Ytest))

    # Evaluate on the held-out test split.
    y_pred_test = gbm.predict(Xtest)
    print("test accuracy:", accuracy_score(Ytest, y_pred_test))

    # Persist the fitted model.
    joblib.dump(gbm, model_file)
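
# A minimal usage sketch: reload one of the .pkl files saved above and score
# fresh rows. `predict_from_file` is a hypothetical helper, assuming the input
# file has the same layout that read_data() expects; adjust as needed.
def predict_from_file(file, model_file):
    X, y = read_data(file)              # features and true labels
    model = joblib.load(model_file)     # restore the fitted estimator
    y_pred = model.predict(X)
    print("accuracy:", accuracy_score(y, y_pred))
    return y_pred
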
if __name__ == '__main__':
    root_dir = 'D:\\data\\quantization\\jqxx2\\'
    model_dir = 'D:\\data\\quantization\\jqxx2_svm_model\\'
    m = '000001.SH.log'  # 12
    demo_1(root_dir + m, model_dir + m[:6] + '.pkl')