天天看點

回歸調庫練習

回歸調庫

相信許多人對調庫充滿了恐懼,那些不同的庫讓人看得眼花繚亂,本次代碼分享就是以波士頓房價為例來做預測模型,並採用網格搜尋來確定最優參數,最後再用最優參數確定最優模型再列印其評估指標,進而對相關調庫進行對比記憶

# 1.回歸算法
# -資料:boston房價
from sklearn.datasets import load_boston#導入波士頓房價資料集
from sklearn.pipeline import Pipeline#導入管道機制
from sklearn.preprocessing import PolynomialFeatures,StandardScaler#導入多項式特征,特征縮放
from sklearn.decomposition import PCA#導入PCA降維
from sklearn.linear_model import Ridge,Lasso#導入L1,L2正則
from sklearn.neighbors import KNeighborsRegressor#導入KNN
from sklearn.tree import DecisionTreeRegressor#導入回歸樹
from sklearn.ensemble import RandomForestRegressor#導入随機森林
from sklearn.model_selection import GridSearchCV,train_test_split#導入網格搜尋,資料切分
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error#導入評估名額
# -加載、分割資料集
# -- Load and split the Boston housing data --
# NOTE(review): load_boston was removed in scikit-learn 1.2; on newer
# versions this dataset must be obtained another way — confirm the
# installed sklearn version before running.
data = load_boston()
x = data.data      # feature matrix
y = data.target    # regression target (house price)
trainx, testx, trainy, testy = train_test_split(x, y, train_size=0.7)

# -- Preprocessing pipeline: polynomial features -> scaling -> PCA(2D) --
poly = PolynomialFeatures(degree=3)
std = StandardScaler()
pca = PCA(n_components=2)
pip = Pipeline([('poly', poly), ('std', std), ('pca', pca)])
# Fit the pipeline on the TRAINING data only, then apply the already
# fitted transform to the test data.  The original code called
# fit_transform on the test set as well, which re-fits the scaler and
# PCA on test statistics (data leakage) and maps the test set into a
# feature space inconsistent with the one the models are trained in.
trainx = pip.fit_transform(trainx)
testx = pip.transform(testx)
# -再利用Rdige/Lasso/Linearegression/KNN/決策樹/随機森林等算法,建立網格搜尋模型,得到最好模型
# -輸出最好模型的測試集的R2值,rmse, mse等評估名額
# -- Ridge (L2) regression: grid-search alpha, then evaluate on the test set --
param_l2 = {'alpha': [0.01, 0.1, 1, 10, 100]}
l2 = Ridge()
grid_search_l2 = GridSearchCV(l2, param_grid=param_l2)
grid_search_l2.fit(trainx, trainy)
l2_best = grid_search_l2.best_params_
print('嶺回歸最優參數', l2_best)
# GridSearchCV already refits the winning model on the whole training
# set (refit=True by default), so reuse best_estimator_ instead of
# constructing and fitting a second Ridge by hand.
l2best = grid_search_l2.best_estimator_
testh_l2 = l2best.predict(testx)
print('嶺回歸r方', r2_score(testy, testh_l2))
print('嶺回歸均方誤差', mean_squared_error(testy, testh_l2))
print('嶺回歸均方誤差根', mean_squared_error(testy, testh_l2) ** 0.5)
print('嶺回歸均絕對值誤差', mean_absolute_error(testy, testh_l2))

# Lasso (L1) regression: tune alpha with a grid search, refit the best
# alpha on the training data and print test-set evaluation metrics.
lasso_params = {'alpha': [0.01, 0.1, 1, 10, 100]}
lasso_search = GridSearchCV(Lasso(), param_grid=lasso_params)
lasso_search.fit(trainx, trainy)
best_lasso_params = lasso_search.best_params_
print('套索回歸最優參數', best_lasso_params)
lasso_model = Lasso(alpha=best_lasso_params['alpha'])
lasso_model.fit(trainx, trainy)
lasso_pred = lasso_model.predict(testx)
lasso_mse = mean_squared_error(testy, lasso_pred)
print('套索回歸r方', r2_score(testy, lasso_pred))
print('套索回歸均方誤差', lasso_mse)
print('套索回歸均方誤差根', lasso_mse ** 0.5)
print('套索回歸均絕對值誤差', mean_absolute_error(testy, lasso_pred))

# KNN regression: grid-search the neighbour count, refit the winning
# value on the training data and print test-set evaluation metrics.
knn_params = {'n_neighbors': [3, 4, 5, 6, 7, 8]}
knn_search = GridSearchCV(KNeighborsRegressor(), param_grid=knn_params)
knn_search.fit(trainx, trainy)
best_knn_params = knn_search.best_params_
print('KNN最優參數', best_knn_params)
knn_model = KNeighborsRegressor(n_neighbors=best_knn_params['n_neighbors'])
knn_model.fit(trainx, trainy)
knn_pred = knn_model.predict(testx)
knn_mse = mean_squared_error(testy, knn_pred)
print('KNNr方', r2_score(testy, knn_pred))
print('KNN均方誤差', knn_mse)
print('KNN均方誤差根', knn_mse ** 0.5)
print('KNN均絕對值誤差', mean_absolute_error(testy, knn_pred))

# Decision-tree regression: grid-search the maximum depth, refit the
# winning depth on the training data and print test-set metrics.
tree_params = {'max_depth': [3, 4, 5, 6, 7]}
tree_search = GridSearchCV(DecisionTreeRegressor(), param_grid=tree_params)
tree_search.fit(trainx, trainy)
dtr_best = tree_search.best_params_
print('回歸樹最優參數', dtr_best)
tree_model = DecisionTreeRegressor(max_depth=dtr_best['max_depth'])
tree_model.fit(trainx, trainy)
testh_dtr = tree_model.predict(testx)
tree_mse = mean_squared_error(testy, testh_dtr)
print('回歸樹r方', r2_score(testy, testh_dtr))
print('回歸樹均方誤差', tree_mse)
print('回歸樹均方誤差根', tree_mse ** 0.5)
print('回歸樹均絕對值誤差', mean_absolute_error(testy, testh_dtr))

# -- Random forest regression: grid-search n_estimators, evaluate on the test set --
param_forest = {'n_estimators': [5, 10, 50, 100]}
forest = RandomForestRegressor()
grid_search_forest = GridSearchCV(forest, param_grid=param_forest)
grid_search_forest.fit(trainx, trainy)
forest_best = grid_search_forest.best_params_
print('随機森林最優參數', forest_best)
forestBest = RandomForestRegressor(n_estimators=forest_best['n_estimators'])
forestBest.fit(trainx, trainy)
testh_forest = forestBest.predict(testx)
# BUG FIX: the original printed every metric for testh_dtr (the
# decision-tree predictions), so the "random forest" results were an
# exact copy of the tree's.  Score the forest's own predictions.
print('随機森林r方', r2_score(testy, testh_forest))
print('随機森林均方誤差', mean_squared_error(testy, testh_forest))
print('随機森林均方誤差根', mean_squared_error(testy, testh_forest) ** 0.5)
print('随機森林均絕對值誤差', mean_absolute_error(testy, testh_forest))
           

其最終運作效果如下:

嶺回歸最優參數 {‘alpha’: 100}

嶺回歸r方 0.2815626620061432

嶺回歸均方誤差 60.2892053502206

嶺回歸均方誤差根 7.764612376044319

嶺回歸均絕對值誤差 5.515834261907906

套索回歸最優參數 {‘alpha’: 0.1}

套索回歸r方 0.28164723100519473

套索回歸均方誤差 60.28210856184332

套索回歸均方誤差根 7.764155366930992

套索回歸均絕對值誤差 5.5153861879821

KNN最優參數 {‘n_neighbors’: 8}

KNNr方 0.17824876248539345

KNN均方誤差 68.95901212993421

KNN均方誤差根 8.304156316564267

KNN均絕對值誤差 5.608141447368421

回歸樹最優參數 {‘max_depth’: 3}

回歸樹r方 0.15175727360697133

回歸樹均方誤差 71.18210206215309

回歸樹均方誤差根 8.436948622704366

回歸樹均絕對值誤差 5.820938902402186

随機森林最優參數 {‘n_estimators’: 100}

随機森林r方 0.15175727360697133

随機森林均方誤差 71.18210206215309

随機森林均方誤差根 8.436948622704366

随機森林均絕對值誤差 5.820938902402186