1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- # -*- encoding:utf-8 -*-
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.ensemble import RandomForestRegressor
- from sklearn.datasets import load_wine
- from sklearn.model_selection import train_test_split
- import numpy as np
- from sklearn.tree import DecisionTreeRegressor
- from sklearn import metrics
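'''
Parameter               Meaning
criterion               Impurity measure used to evaluate a split; the two choices are the Gini index and information entropy.
max_depth               Maximum depth of a tree; branches that would grow beyond this depth are pruned.
min_samples_leaf        After a split, every child node must contain at least min_samples_leaf training samples; otherwise the split does not happen.
min_samples_split       A node must contain at least min_samples_split training samples before it is allowed to split; otherwise the split does not happen.
max_features            Limits the number of features considered at each split; features beyond the limit are ignored. The default is the square root of the total number of features, truncated to an integer (this is the classifier default; RandomForestRegressor considers all features by default).
min_impurity_decrease   Lower bound on the information gain (impurity decrease) of a split; a split whose gain is below this value does not happen.
'''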
def demo_wine():
    """Fit a 10-tree random forest classifier on the wine dataset.

    Loads scikit-learn's bundled wine data, holds out 30% of the samples,
    trains on the remainder and prints the classifier's score (mean
    accuracy) on the held-out part.
    """
    dataset = load_wine()
    features, labels = dataset.data, dataset.target

    # The split itself is not seeded, so the printed score varies between
    # runs even though the forest uses a fixed random_state.
    split = train_test_split(features, labels, test_size=0.3)
    train_x, test_x, train_y, test_y = split

    forest = RandomForestClassifier(n_estimators=10, random_state=0)
    forest.fit(train_x, train_y)

    holdout_score = forest.score(test_x, test_y)
    print(holdout_score)
def read_data(path):
    """Load samples from a text file holding one Python expression per line.

    Each non-blank line is evaluated and must yield a 2-item sequence
    ``(x, y)``. Blank and whitespace-only lines are skipped.

    Args:
        path: Path of the text file to read.

    Returns:
        A tuple ``(X, y)`` of numpy arrays built from the first and second
        item of every sample, in file order.

    Raises:
        ValueError: If the file contains no samples, or the evaluated lines
            cannot be unpacked into exactly two columns.
    """
    with open(path) as f:
        # Iterate the file lazily instead of materialising it via readlines().
        stripped_lines = (line.strip() for line in f)
        # SECURITY NOTE(review): eval() executes arbitrary code found in the
        # file. Only use this on trusted data; if every line is a plain
        # literal, ast.literal_eval would be a safe drop-in replacement.
        samples = [eval(text) for text in stripped_lines if text]

    if not samples:
        # zip(*[]) yields nothing, which used to surface as an opaque
        # "not enough values to unpack" error; keep the exception type but
        # say what actually went wrong.
        raise ValueError("no samples found in {!r}".format(path))

    X, y = zip(*samples)
    return np.array(X), np.array(y)
def demo(train_path="../bbztx/train_data", test_path="../bbztx/test_data"):
    """Train a random forest regressor and report validation and test metrics.

    Reads the training and test sets with ``read_data``, holds out 30% of the
    training set for validation, fits a 10-tree forest with ``max_depth=10``
    and prints, in order:

    1. the regressor's score (R^2) on the validation hold-out,
    2. the predictions for the test set,
    3. the mean squared error of those predictions against the test targets.

    Args:
        train_path: File holding the training samples, in the format
            ``read_data`` expects.
        test_path: File holding the test samples, same format.
    """
    X_all, y_all = read_data(train_path)
    X_test, y_test = read_data(test_path)

    # Validation split of the training data. It is not seeded, so the
    # printed numbers vary between runs even though the forest itself uses a
    # fixed random_state.
    X_fit, X_val, y_fit, y_val = train_test_split(X_all, y_all, test_size=0.3)

    model = RandomForestRegressor(random_state=0, n_estimators=10, max_depth=10)
    model.fit(X_fit, y_fit)
    print(model.score(X_val, y_val))

    # Predict the test set once and reuse the result for both the printout
    # and the MSE computation.
    y_pred_test = model.predict(X_test)
    print(y_pred_test)

    test_mse = metrics.mean_squared_error(y_test, y_pred_test)
    print("测试集MSE:", test_mse)
# Run the regression demo only when executed as a script, not on import.
if __name__ == "__main__":
    demo()
|