# 天天看点
#
# 机器学习笔记04梯度提升树(GBDT)调参

# -*- coding: utf-8 -*-

'''
数据: train_modified
'''

'''
scikit-learn 梯度提升树(GBDT)算法类库
GBDT分类类: GradientBoostingClassifier; GBDT回归类: GradientBoostingRegressor

GBDT类库boosting框架参数
(1)n_estimators(最大的弱学习器个数/弱学习器最大迭代次数): 100(默认)
(2)learning_rate(每个弱学习器的权重缩减系数v, 步长): 0.1(默认)
        n_estimators与 learning_rate一起调参,共同决定算法的拟合效果
(3)subsample(子采样,不放回抽样):1.0(默认,即不使用子采样,建议取0.5~0.8)
(4)loss(损失函数)
    分类模型GradientBoostingClassifier: 1.对数似然损失函数(deviance) 默认
                                        2.指数损失函数(exponential) 等同于AdaBoosting
                                        
    回归模型GradientBoostingRegressor:  1.均方差(ls) 默认
                                        2.绝对损失(lad)
                                        3.Huber损失(huber)
                                        4.分位数损失(quantile)

决策树参数:
(1)criterion(特征选择标准):
        分类决策树DecisionTreeClassifier:'gini'(默认,基尼系数,即CART算法), 备选'entropy'(熵,ID3/C4.5)
        回归决策树DecisionTreeRegressor: 'mse'(默认, 均方误差), 'mae'(和均值之差的绝对值之和)
(2)max_features(划分时考虑的最大特征数): 
        None(默认,划分时考虑所有的特征数,特征数 < 50,建议采用默认的None)
        'log2'(划分时最多考虑log2N个特征)
        'sqrt'/'auto'(划分时最多考虑N开根号个特征)
        'int'(考虑的特征的绝对数)
        'float'(考虑的特征百分比,即百分比*N取整后的特征数)
(3)max_depth(决策树的最大深度):
        不输入(默认, 决策树在建立时不会限制子树的深度)
        10~100(模型样本量多, 特征多)
(4)min_samples_split(内部节点再划分所需最小样本数):
        限制子树继续划分的条件,如果某节点的样本数<min_samples_split,则不会继续选取最优特征来进行划分
        2(默认,样本量小,建议采用默认值2,样本量数量级非常大,建议增大此值)
(5)min_samples_leaf(叶子节点最小样本数):
        限制叶子节点最少的样本数,如果某叶子节点的样本数少于min_samples_leaf,则会和兄弟节点一起被剪枝
        1(默认,样本量小,建议采用默认值1,样本量数量级非常大,建议增大此值)
'''

import os
import pandas as pd
import numpy as np

# Load the data set.
# The working directory is switched to an absolute Windows path before the
# CSV is read by its bare file name.
os.chdir(r'F:\python_data_mining\train_modified')
data = pd.read_csv('train_modified.csv').drop(columns=['ID'])
print(f'数据预览: \n{data.head()}')

# After dropping 'ID', column 0 is used as the target and every remaining
# column as a feature.
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

print(f'目标变量y的样本分布: \n{y.value_counts()}')
print('=' * 33)

# Split into training and test sets: 30% of the rows are held out, with a
# fixed seed so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=12345)

# Baseline gradient boosting classifier: only the random seed is set, every
# other hyper-parameter keeps its library default.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
gbc = GradientBoostingClassifier(random_state=12345).fit(X_train, y_train)

# Column 1 of predict_proba is used as the score for the AUC computation.
proba_test = gbc.predict_proba(X_test)[:, 1]
pred_test = gbc.predict(X_test)

# Report accuracy and AUC on the held-out test set.
print('GBDT模型参数无调优时:')
print(f'accuracy = {metrics.accuracy_score(y_test, pred_test)}')
print(f'AUC_score = {metrics.roc_auc_score(y_test, proba_test)}')
print('=' * 33)

# Step 1: grid search over n_estimators (number of boosting stages), scored
# by ROC AUC with 4-fold cross-validation on the training set.
# The remaining hyper-parameters are fixed starting values.
#
# NOTE: the `iid` argument is deliberately not passed. It was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# From 0.22 onwards the default already matches the former `iid=False`.
param_test_1 = {'n_estimators': range(10, 81, 5)}
grid_search_1 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1,
                                                                  max_depth=8,
                                                                  min_samples_split=300,
                                                                  min_samples_leaf=20,
                                                                  subsample=0.8,
                                                                  max_features='sqrt',
                                                                  random_state=12345),
                             param_grid=param_test_1, scoring='roc_auc', cv=4)
grid_search_1.fit(X_train, y_train)
# predict / predict_proba delegate to the best estimator found by the search.
pred_test_1 = grid_search_1.predict(X_test)
proba_test_1 = grid_search_1.predict_proba(X_test)[:, 1]
print('网格搜索GridSearchCV调优n_estimators(弱学习器最大迭代次数)参数:')
# print('cv_results = \n{}'.format(grid_search_1.cv_results_))
print('best_params = {}'.format(grid_search_1.best_params_))
print('best_score = {}'.format(grid_search_1.best_score_))
print('accuracy = {}'.format(metrics.accuracy_score(y_test, pred_test_1)))
print('AUC_score = {} '.format(metrics.roc_auc_score(y_test, proba_test_1)))
print('=================================')

# The previous search selected n_estimators=35.
# Step 2: joint grid search over max_depth and min_samples_split, scored by
# ROC AUC with 4-fold cross-validation.
param_test_2 = {'max_depth': range(2, 11, 2),
                'min_samples_split': range(50, 501, 50)}

# NOTE: `iid` is not passed; it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises TypeError.
grid_search_2 = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=35,
                                                                  learning_rate=0.1,
                                                                  min_samples_leaf=20,
                                                                  subsample=0.8,
                                                                  max_features='sqrt',
                                                                  random_state=12345
                                                                  ),
                             param_grid=param_test_2, scoring='roc_auc', cv=4)
# Fit on the TRAINING split. The original fitted on (X_test, y_test), which
# tuned the model on the very data used for the evaluation below and so
# reported optimistically biased test metrics.
grid_search_2.fit(X_train, y_train)
pred_test_2 = grid_search_2.predict(X_test)
proba_test_2 = grid_search_2.predict_proba(X_test)[:, 1]
print('网格搜索调优max_depth和min_samples_split参数')
print('best_param = {}'.format(grid_search_2.best_params_))
print('best_score = {}'.format(grid_search_2.best_score_))
print('accuracy = {}'.format(metrics.accuracy_score(y_test, pred_test_2)))
print('AUC_score = {}'.format(metrics.roc_auc_score(y_test, proba_test_2)))
print('=================================')

# The previous search selected max_depth=6 and min_samples_split=300.
# Step 3: joint grid search over min_samples_split and min_samples_leaf,
# scored by ROC AUC with 4-fold cross-validation on the training set.
#
# NOTE: `iid` is not passed; it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises TypeError.
param_test_3 = {'min_samples_split': range(100, 1201, 100),
                'min_samples_leaf': range(10, 101, 10)}
grid_search_3 = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=35,
                                                                  learning_rate=0.1,
                                                                  max_depth=6,
                                                                  max_features='sqrt',
                                                                  subsample=0.8,
                                                                  random_state=12345),
                             param_grid=param_test_3, scoring='roc_auc', cv=4)
grid_search_3.fit(X_train, y_train)
# Evaluate the best estimator found by the search on the held-out test set.
pred_test_3 = grid_search_3.predict(X_test)
proba_test_3 = grid_search_3.predict_proba(X_test)[:, 1]
print('网格搜索调优min_samples_split和min_samples_leaf参数:')
print('best_param = {}'.format(grid_search_3.best_params_))
print('best_score = {}'.format(grid_search_3.best_score_))
print('accuracy = {}'.format(metrics.accuracy_score(y_test, pred_test_3)))
print('AUC_score = {}'.format(metrics.roc_auc_score(y_test, proba_test_3)))
print('=================================')

# The previous search selected min_samples_split=100 and min_samples_leaf=50.
# Step 4: grid search over max_features (number of features considered per
# split, given here as absolute integer counts), scored by ROC AUC with
# 4-fold cross-validation on the training set.
#
# NOTE: `iid` is not passed; it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises TypeError.
param_test_4 = {'max_features': range(5, 31, 2)}
grid_search_4 = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=35,
                                                                  learning_rate=0.1,
                                                                  max_depth=6,
                                                                  min_samples_split=100,
                                                                  min_samples_leaf=50,
                                                                  subsample=0.8,
                                                                  random_state=12345),
                             param_grid=param_test_4, scoring='roc_auc', cv=4)
grid_search_4.fit(X_train, y_train)
# Evaluate the best estimator found by the search on the held-out test set.
pred_test_4 = grid_search_4.predict(X_test)
proba_test_4 = grid_search_4.predict_proba(X_test)[:, 1]
print('网格搜索调优max_features参数')
print('best_params = {}'.format(grid_search_4.best_params_))
print('best_score = {}'.format(grid_search_4.best_score_))
print('accuracy = {}'.format(metrics.accuracy_score(y_test, pred_test_4)))
print('AUC_score = {}'.format(metrics.roc_auc_score(y_test, proba_test_4)))
print('=================================')

# The previous search selected max_features=7.
# Step 5: grid search over subsample (fraction of samples used to fit each
# tree), scored by ROC AUC with 4-fold cross-validation on the training set.
param_test_5 = {'subsample': [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]}

# NOTE: `iid` is not passed; it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises TypeError.
grid_search_5 = GridSearchCV(estimator=GradientBoostingClassifier(n_estimators=35,
                                                                  learning_rate=0.1,
                                                                  max_depth=6,
                                                                  min_samples_split=100,
                                                                  min_samples_leaf=50,
                                                                  max_features=7,
                                                                  random_state=12345),
                             param_grid=param_test_5, scoring='roc_auc', cv=4)
grid_search_5.fit(X_train, y_train)
# Evaluate the best estimator found by the search on the held-out test set.
pred_test_5 = grid_search_5.predict(X_test)
proba_test_5 = grid_search_5.predict_proba(X_test)[:, 1]
print('网格搜索调优subsample参数:')
print('best_params = {}'.format(grid_search_5.best_params_))
print('best_score = {}'.format(grid_search_5.best_score_))
print('accuracy = {}'.format(metrics.accuracy_score(y_test, pred_test_5)))
print('AUC_score = {}'.format(metrics.roc_auc_score(y_test, proba_test_5)))
print('=================================')

# The previous search selected subsample=0.8.
#
# Combined result of the grid searches:
# n_estimators = 35, max_depth = 6, min_samples_split = 100,
# min_samples_leaf = 50, max_features = 7, subsample = 0.8

# Train a single model with the combined settings and score it on the test set.
gbc_1 = GradientBoostingClassifier(
    n_estimators=35,
    learning_rate=0.1,
    max_depth=6,
    min_samples_split=100,
    min_samples_leaf=50,
    max_features=7,
    subsample=0.8,
    random_state=12345,
).fit(X_train, y_train)

# Column 1 of predict_proba is used as the score for the AUC computation.
proba_test_6 = gbc_1.predict_proba(X_test)[:, 1]
pred_test_6 = gbc_1.predict(X_test)
print(f'accuracy = {metrics.accuracy_score(y_test, pred_test_6)}')
print(f'AUC_score = {metrics.roc_auc_score(y_test, proba_test_6)}')
print('=' * 33)

# Variant: learning rate lowered to 0.01 with n_estimators raised to 140;
# the tree-level settings and subsample=0.8 are the same as for gbc_1.
gbc_2 = GradientBoostingClassifier(
    n_estimators=140,
    learning_rate=0.01,
    max_depth=6,
    min_samples_split=100,
    min_samples_leaf=50,
    max_features=7,
    subsample=0.8,
    random_state=12345,
).fit(X_train, y_train)

# Column 1 of predict_proba is used as the score for the AUC computation.
proba_test_7 = gbc_2.predict_proba(X_test)[:, 1]
pred_test_7 = gbc_2.predict(X_test)
print(f'accuracy = {metrics.accuracy_score(y_test, pred_test_7)}')
print(f'AUC_score = {metrics.roc_auc_score(y_test, proba_test_7)}')
print('=' * 33)

# Variant: same as gbc_2 (learning_rate=0.01, n_estimators=140) except that
# subsample is reduced to 0.7.
gbc_3 = GradientBoostingClassifier(
    n_estimators=140,
    learning_rate=0.01,
    max_depth=6,
    min_samples_split=100,
    min_samples_leaf=50,
    max_features=7,
    subsample=0.7,
    random_state=12345,
).fit(X_train, y_train)

# Column 1 of predict_proba is used as the score for the AUC computation.
proba_test_8 = gbc_3.predict_proba(X_test)[:, 1]
pred_test_8 = gbc_3.predict(X_test)
print(f'accuracy = {metrics.accuracy_score(y_test, pred_test_8)}')
print(f'AUC_score = {metrics.roc_auc_score(y_test, proba_test_8)}')
print('=' * 33)
           

# 继续阅读