回归调库

相信许多人对调库充满了恐惧，各种不同的库让人看得眼花缭乱。本次代码分享以波士顿房价数据集为例构建预测模型：先用网格搜索确定最优参数，再用最优参数训练出最优模型并打印其评估指标，从而对相关库的调用方式进行对比记忆。

# 1.回归算法
# -数据：boston房价
from sklearn.datasets import load_boston#导入波士顿房价数据集
from sklearn.pipeline import Pipeline#导入管道机制
from sklearn.preprocessing import PolynomialFeatures,StandardScaler#导入多项式特征，特征缩放
from sklearn.decomposition import PCA#导入PCA降维
from sklearn.linear_model import Ridge,Lasso#导入L1，L2正则
from sklearn.neighbors import KNeighborsRegressor#导入KNN
from sklearn.tree import DecisionTreeRegressor#导入回归树
from sklearn.ensemble import RandomForestRegressor#导入随机森林
from sklearn.model_selection import GridSearchCV,train_test_split#导入网格搜索，数据切分
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error#导入评估指标
# Load the Boston housing data and hold out 30% of the samples for testing.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this script
# needs an older release -- confirm the installed version.
data = load_boston()
x, y = data.data, data.target
# No random_state is passed, so the split (and every metric below) varies
# from run to run.
trainx, testx, trainy, testy = train_test_split(x, y, train_size=0.7)
# Feature-engineering pipeline, applied in this order:
#   1. PolynomialFeatures expands the inputs with degree-3 polynomial terms
#   2. StandardScaler standardizes the expanded features
#   3. PCA reduces the result to two components
poly = PolynomialFeatures(degree=3)
std = StandardScaler()
pca = PCA(n_components=2)
pip = Pipeline([('poly', poly), ('std', std), ('pca', pca)])

# Fit the pipeline on the training set only.
trainx = pip.fit_transform(trainx)
# Reuse the already-fitted scaler and PCA for the test set. Calling
# fit_transform here would re-fit both steps on the test data and project it
# into a different space than the one the models are trained on.
testx = pip.transform(testx)
# Model selection: each estimator below (Ridge, Lasso, KNN, decision tree,
# random forest) is tuned with a grid search, refit with the winning
# parameters, and scored on the test set (R2, MSE, RMSE, MAE).

# Ridge (L2-regularized) regression: search over the regularization strength.
param_l2 = {'alpha': [0.01, 0.1, 1, 10, 100]}
l2 = Ridge()
grid_search_l2 = GridSearchCV(l2, param_grid=param_l2)
grid_search_l2.fit(trainx, trainy)
l2_best = grid_search_l2.best_params_
print('岭回归最优参数', l2_best)

# The grid has a single key, so unpacking it equals passing alpha=... directly.
l2best = Ridge(**l2_best)
l2best.fit(trainx, trainy)
testh_l2 = l2best.predict(testx)

# Compute the MSE once; the RMSE is its square root.
mse_l2 = mean_squared_error(testy, testh_l2)
for label, value in (
    ('岭回归r方', r2_score(testy, testh_l2)),
    ('岭回归均方误差', mse_l2),
    ('岭回归均方误差根', mse_l2 ** 0.5),
    ('岭回归均绝对值误差', mean_absolute_error(testy, testh_l2)),
):
    print(label, value)

# Lasso (L1-regularized) regression: search over the regularization strength.
param_l1 = {'alpha': [0.01, 0.1, 1, 10, 100]}
l1 = Lasso()
grid_search_l1 = GridSearchCV(l1, param_grid=param_l1)
grid_search_l1.fit(trainx, trainy)
l1_best = grid_search_l1.best_params_
print('套索回归最优参数', l1_best)

# Refit a fresh model with the selected alpha and predict on the test set.
l1best = Lasso(**l1_best)
l1best.fit(trainx, trainy)
testh_l1 = l1best.predict(testx)

# Compute the MSE once; the RMSE is its square root.
mse_l1 = mean_squared_error(testy, testh_l1)
for label, value in (
    ('套索回归r方', r2_score(testy, testh_l1)),
    ('套索回归均方误差', mse_l1),
    ('套索回归均方误差根', mse_l1 ** 0.5),
    ('套索回归均绝对值误差', mean_absolute_error(testy, testh_l1)),
):
    print(label, value)

# K-nearest-neighbours regression: search over the number of neighbours.
param_knn = {'n_neighbors': [3, 4, 5, 6, 7, 8]}
knn = KNeighborsRegressor()
grid_search_knn = GridSearchCV(knn, param_grid=param_knn)
grid_search_knn.fit(trainx, trainy)
knn_best = grid_search_knn.best_params_
print('KNN最优参数', knn_best)

# Refit a fresh model with the selected n_neighbors and predict on the test set.
knnBest = KNeighborsRegressor(**knn_best)
knnBest.fit(trainx, trainy)
testh_knn = knnBest.predict(testx)

# Compute the MSE once; the RMSE is its square root.
mse_knn = mean_squared_error(testy, testh_knn)
for label, value in (
    ('KNNr方', r2_score(testy, testh_knn)),
    ('KNN均方误差', mse_knn),
    ('KNN均方误差根', mse_knn ** 0.5),
    ('KNN均绝对值误差', mean_absolute_error(testy, testh_knn)),
):
    print(label, value)

# Decision-tree regression: search over the maximum tree depth.
param_dtr = {'max_depth': [3, 4, 5, 6, 7]}
dtr = DecisionTreeRegressor()
grid_search_dtr = GridSearchCV(dtr, param_grid=param_dtr)
grid_search_dtr.fit(trainx, trainy)
dtr_best = grid_search_dtr.best_params_
print('回归树最优参数', dtr_best)

# Refit a fresh tree with the selected max_depth and predict on the test set.
dtrBest = DecisionTreeRegressor(**dtr_best)
dtrBest.fit(trainx, trainy)
testh_dtr = dtrBest.predict(testx)

# Compute the MSE once; the RMSE is its square root.
mse_dtr = mean_squared_error(testy, testh_dtr)
for label, value in (
    ('回归树r方', r2_score(testy, testh_dtr)),
    ('回归树均方误差', mse_dtr),
    ('回归树均方误差根', mse_dtr ** 0.5),
    ('回归树均绝对值误差', mean_absolute_error(testy, testh_dtr)),
):
    print(label, value)

# Random-forest regression: search over the number of trees.
param_forest = {'n_estimators': [5, 10, 50, 100]}
forest = RandomForestRegressor()
grid_search_forest = GridSearchCV(forest, param_grid=param_forest)
grid_search_forest.fit(trainx, trainy)
forest_best = grid_search_forest.best_params_
print('随机森林最优参数', forest_best)

# Refit a fresh forest with the selected n_estimators and predict on the test set.
forestBest = RandomForestRegressor(n_estimators=forest_best['n_estimators'])
forestBest.fit(trainx, trainy)
testh_forest = forestBest.predict(testx)

# Score the forest's own predictions. The metrics were previously computed
# from testh_dtr, so they silently repeated the decision tree's results.
print('随机森林r方', r2_score(testy, testh_forest))
print('随机森林均方误差', mean_squared_error(testy, testh_forest))
print('随机森林均方误差根', mean_squared_error(testy, testh_forest) ** 0.5)
print('随机森林均绝对值误差', mean_absolute_error(testy, testh_forest))

其最终运行效果如下：

岭回归最优参数 {'alpha': 100}

岭回归r方 0.2815626620061432

岭回归均方误差 60.2892053502206

岭回归均方误差根 7.764612376044319

岭回归均绝对值误差 5.515834261907906

套索回归最优参数 {'alpha': 0.1}

套索回归r方 0.28164723100519473

套索回归均方误差 60.28210856184332

套索回归均方误差根 7.764155366930992

套索回归均绝对值误差 5.5153861879821

KNN最优参数 {'n_neighbors': 8}

KNNr方 0.17824876248539345

KNN均方误差 68.95901212993421

KNN均方误差根 8.304156316564267

KNN均绝对值误差 5.608141447368421

回归树最优参数 {'max_depth': 3}

回归树r方 0.15175727360697133

回归树均方误差 71.18210206215309

回归树均方误差根 8.436948622704366

回归树均绝对值误差 5.820938902402186

随机森林最优参数 {'n_estimators': 100}

随机森林r方 0.15175727360697133

随机森林均方误差 71.18210206215309

随机森林均方误差根 8.436948622704366

随机森林均绝对值误差 5.820938902402186

（注：上面随机森林的四项指标与回归树的结果完全相同，说明这组数值是用回归树的预测值 testh_dtr 计算得到的，并不代表随机森林的真实表现。）

回归调库练习回归调库

回归调库

继续阅读

libsvm for python 安装

学习软件测试基础测试第七天

Zeppelin 配置访问 REST APIApache Zeppelin Configuration REST API

【Torch】最简洁logging使用指南

笔试面试题目：滑动窗口(二)

27. Remove Element(列表)题目代码

数据结构与算法（27）——排序（二）

Dijkstra--简易版（最短路径）

GitHub连夜封杀！这份阿里 10W 字内部 Java 字面试手册到底有多强？

Cloud Studio初体验

使用 ctypes 进行 Python 和 C 的混合编程

【python】【数据处理】画多维数据分布图

【python】netconf协议对接管理设备

「Python 网络自动化」NETCONF —— Python 使用 NETCONF 管理配置 H3C 网络设备

在python中创建excel并写入

hdu7108哈希