天天看点

回归调库练习回归调库

回归调库

相信许多人对调库充满了恐惧,各种各样的库让人眼花缭乱。本次代码分享以波士顿房价数据集为例构建预测模型:先用网格搜索确定各算法的最优参数,再用最优参数训练模型并打印其评估指标,以便对相关库的用法进行对比记忆。

# 1.回归算法
# -数据:boston房价
from sklearn.datasets import load_boston#导入波士顿房价数据集
from sklearn.pipeline import Pipeline#导入管道机制
from sklearn.preprocessing import PolynomialFeatures,StandardScaler#导入多项式特征,特征缩放
from sklearn.decomposition import PCA#导入PCA降维
from sklearn.linear_model import Ridge,Lasso#导入L1,L2正则
from sklearn.neighbors import KNeighborsRegressor#导入KNN
from sklearn.tree import DecisionTreeRegressor#导入回归树
from sklearn.ensemble import RandomForestRegressor#导入随机森林
from sklearn.model_selection import GridSearchCV,train_test_split#导入网格搜索,数据切分
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error#导入评估指标
# Load the Boston housing data and hold out 30% of the samples for testing.
# NOTE: load_boston was removed in scikit-learn 1.2, so this script needs an
# older scikit-learn release to run as written.
data = load_boston()
x, y = data.data, data.target
trainx, testx, trainy, testy = train_test_split(x, y, train_size=0.7)
# Feature-engineering pipeline:
#   1. expand the raw features into degree-3 polynomial terms,
#   2. standardize every expanded feature,
#   3. project the result onto the first two principal components.
poly = PolynomialFeatures(degree=3)
std = StandardScaler()
pca = PCA(n_components=2)
pip = Pipeline([('poly', poly), ('std', std), ('pca', pca)])
# Learn the scaling statistics and the PCA axes from the training split only.
trainx = pip.fit_transform(trainx)
# Apply the already-fitted pipeline to the test split. Calling fit_transform
# here would refit the scaler and PCA on the test data, projecting it onto
# different axes than the ones the models are trained on and leaking test
# statistics into preprocessing.
testx = pip.transform(testx)
# Model selection: for each algorithm below (Ridge, Lasso, KNN, decision
# tree, random forest) run a grid search, then report the best model's
# test-set metrics (R^2, MSE, RMSE, MAE).
#
# Ridge (L2-regularized) regression: search over the regularization strength.
param_l2 = {'alpha': [0.01, 0.1, 1, 10, 100]}
l2 = Ridge()
grid_search_l2 = GridSearchCV(estimator=l2, param_grid=param_l2)
grid_search_l2.fit(trainx, trainy)
l2_best = grid_search_l2.best_params_
print('岭回归最优参数', l2_best)
# Train a fresh model with the winning alpha and score it on the test split.
l2best = Ridge(**l2_best)
l2best.fit(trainx, trainy)
testh_l2 = l2best.predict(testx)
mse_l2 = mean_squared_error(testy, testh_l2)
print('岭回归r方', r2_score(testy, testh_l2))
print('岭回归均方误差', mse_l2)
print('岭回归均方误差根', mse_l2 ** 0.5)  # RMSE is the square root of MSE
print('岭回归均绝对值误差', mean_absolute_error(testy, testh_l2))

# Lasso (L1-regularized) regression: search over the regularization strength.
param_l1 = {'alpha': [0.01, 0.1, 1, 10, 100]}
l1 = Lasso()
grid_search_l1 = GridSearchCV(estimator=l1, param_grid=param_l1)
grid_search_l1.fit(trainx, trainy)
l1_best = grid_search_l1.best_params_
print('套索回归最优参数', l1_best)
# Train a fresh model with the winning alpha and score it on the test split.
l1best = Lasso(**l1_best)
l1best.fit(trainx, trainy)
testh_l1 = l1best.predict(testx)
mse_l1 = mean_squared_error(testy, testh_l1)
print('套索回归r方', r2_score(testy, testh_l1))
print('套索回归均方误差', mse_l1)
print('套索回归均方误差根', mse_l1 ** 0.5)  # RMSE is the square root of MSE
print('套索回归均绝对值误差', mean_absolute_error(testy, testh_l1))

# K-nearest-neighbours regression: search over the neighbourhood size.
param_knn = {'n_neighbors': [3, 4, 5, 6, 7, 8]}
knn = KNeighborsRegressor()
grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_knn)
grid_search_knn.fit(trainx, trainy)
knn_best = grid_search_knn.best_params_
print('KNN最优参数', knn_best)
# Train a fresh model with the winning k and score it on the test split.
knnBest = KNeighborsRegressor(**knn_best)
knnBest.fit(trainx, trainy)
testh_knn = knnBest.predict(testx)
mse_knn = mean_squared_error(testy, testh_knn)
print('KNNr方', r2_score(testy, testh_knn))
print('KNN均方误差', mse_knn)
print('KNN均方误差根', mse_knn ** 0.5)  # RMSE is the square root of MSE
print('KNN均绝对值误差', mean_absolute_error(testy, testh_knn))

# Decision-tree regression: search over the maximum tree depth.
param_dtr = {'max_depth': [3, 4, 5, 6, 7]}
dtr = DecisionTreeRegressor()
grid_search_dtr = GridSearchCV(estimator=dtr, param_grid=param_dtr)
grid_search_dtr.fit(trainx, trainy)
dtr_best = grid_search_dtr.best_params_
print('回归树最优参数', dtr_best)
# Train a fresh tree with the winning depth and score it on the test split.
dtrBest = DecisionTreeRegressor(**dtr_best)
dtrBest.fit(trainx, trainy)
testh_dtr = dtrBest.predict(testx)
mse_dtr = mean_squared_error(testy, testh_dtr)
print('回归树r方', r2_score(testy, testh_dtr))
print('回归树均方误差', mse_dtr)
print('回归树均方误差根', mse_dtr ** 0.5)  # RMSE is the square root of MSE
print('回归树均绝对值误差', mean_absolute_error(testy, testh_dtr))

# Random-forest regression: search over the number of trees.
param_forest = {'n_estimators': [5, 10, 50, 100]}
forest = RandomForestRegressor()
grid_search_forest = GridSearchCV(forest, param_grid=param_forest)
grid_search_forest.fit(trainx, trainy)
forest_best = grid_search_forest.best_params_
print('随机森林最优参数', forest_best)
# Train a fresh forest with the winning size and score it on the test split.
forestBest = RandomForestRegressor(n_estimators=forest_best['n_estimators'])
forestBest.fit(trainx, trainy)
testh_forest = forestBest.predict(testx)
# Evaluate the forest's own predictions (testh_forest). Scoring testh_dtr
# here would simply repeat the decision tree's metrics.
print('随机森林r方', r2_score(testy, testh_forest))
print('随机森林均方误差', mean_squared_error(testy, testh_forest))
print('随机森林均方误差根', mean_squared_error(testy, testh_forest) ** 0.5)
print('随机森林均绝对值误差', mean_absolute_error(testy, testh_forest))
           

其最终运行效果如下:

岭回归最优参数 {'alpha': 100}

岭回归r方 0.2815626620061432

岭回归均方误差 60.2892053502206

岭回归均方误差根 7.764612376044319

岭回归均绝对值误差 5.515834261907906

套索回归最优参数 {'alpha': 0.1}

套索回归r方 0.28164723100519473

套索回归均方误差 60.28210856184332

套索回归均方误差根 7.764155366930992

套索回归均绝对值误差 5.5153861879821

KNN最优参数 {'n_neighbors': 8}

KNNr方 0.17824876248539345

KNN均方误差 68.95901212993421

KNN均方误差根 8.304156316564267

KNN均绝对值误差 5.608141447368421

回归树最优参数 {'max_depth': 3}

回归树r方 0.15175727360697133

回归树均方误差 71.18210206215309

回归树均方误差根 8.436948622704366

回归树均绝对值误差 5.820938902402186

随机森林最优参数 {'n_estimators': 100}

随机森林r方 0.15175727360697133

随机森林均方误差 71.18210206215309

随机森林均方误差根 8.436948622704366

随机森林均绝对值误差 5.820938902402186

注:此处随机森林的各项指标与回归树完全相同,说明这份输出是在评估时误用回归树预测值 testh_dtr 的情况下得到的,并不代表随机森林的真实表现。