# -*- encoding:utf-8 -*-
"""Random-forest demos built on scikit-learn.

``demo_wine`` trains a classifier on the bundled wine dataset, and ``demo``
trains a regressor on samples loaded from the path configured under
``application.train_data_path``.
"""
import numpy as np
from sklearn import metrics
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

from util.config import config

# Tree hyper-parameter cheat sheet (parameter -> meaning):
#   criterion              Impurity measure; either the Gini index or
#                          information entropy.
#   max_depth              Maximum depth of the tree; branches deeper than
#                          this are pruned.
#   min_samples_leaf       Every child produced by a split must hold at least
#                          this many training samples, otherwise the split
#                          does not happen.
#   min_samples_split      A node must hold at least this many training
#                          samples before it may be split.
#   max_features           Limits how many features are considered at a
#                          split; features beyond the limit are discarded.
#                          The original note gives the default as the integer
#                          square root of the feature count -- the actual
#                          default depends on the estimator and the
#                          scikit-learn version, so verify before relying on it.
#   min_impurity_decrease  Minimum information gain; splits that gain less
#                          than this value do not happen.


def demo_wine():
    """Fit a 10-tree random-forest classifier on the wine data and print its score.

    The data is split 70/30 into train/test. The split itself is not seeded,
    so the printed score can differ from run to run.
    """
    wine = load_wine()
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        wine.data, wine.target, test_size=0.3
    )
    rfc = RandomForestClassifier(random_state=0, n_estimators=10)
    rfc = rfc.fit(Xtrain, Ytrain)
    print(rfc.score(Xtest, Ytest))


def read_data(path):
    """Load ``(features, target)`` samples from a text file.

    Every non-blank line is evaluated as a Python expression and must yield
    a two-item pair ``(x, y)``.

    Args:
        path: Path of the text file to read.

    Returns:
        A tuple ``(X, y)`` of numpy arrays built from the first and second
        items of every sample, in file order.

    Raises:
        ValueError: If the file contains no samples, or a line does not
            evaluate to a two-item pair.
    """
    samples = []
    with open(path) as f:
        for line in f:
            text = line.strip()
            if not text:
                # Blank (e.g. trailing) lines would make eval() raise SyntaxError.
                continue
            # NOTE(security): eval() executes arbitrary code found in the file.
            # Only use this on trusted data. If every line is a plain Python
            # literal, ast.literal_eval is a safe replacement -- confirm the
            # data format before switching.
            samples.append(eval(text))
    if not samples:
        raise ValueError("no samples found in %r" % (path,))
    X, y = zip(*samples)
    return np.array(X), np.array(y)


def demo():
    """Train a random-forest regressor on the configured data and print metrics.

    Prints, in order: the R^2 score on a 30% hold-out split, the predictions
    for the "test" set, and the mean squared error on that set.
    """
    X_train, y_train = read_data(config.get('application', 'train_data_path'))
    # NOTE(review): the "test" set is loaded from the *training* path, so the
    # MSE printed below is measured on data the model has largely been fitted
    # on. This looks like a copy/paste slip; point it at the real test-data
    # config key once that key is confirmed.
    X_test, y_test = read_data(config.get('application', 'train_data_path'))

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        X_train, y_train, test_size=0.3
    )
    rfc = RandomForestRegressor(random_state=0, n_estimators=10, max_depth=10)
    rfc = rfc.fit(Xtrain, Ytrain)
    print(rfc.score(Xtest, Ytest))

    # Predict once and reuse the result for both the printout and the MSE.
    y_pred_test = rfc.predict(X_test)
    print(y_pred_test)
    test_mse = metrics.mean_squared_error(y_test, y_pred_test)
    print("测试集MSE:", test_mse)


if __name__ == '__main__':
    demo()