

《Python金融大資料風控模組化實戰》 Chapter 17: Ensemble Learning

  • Chapter introduction
  • Python code implementation and comments

Chapter introduction

Ensemble learning aims to expand the hypothesis space by training multiple models, thereby gradually approximating the true rules underlying the data. At the same time, the probability that several independently trained models all fall into the same local minimum is low, which helps secure a relatively good result on the test set.

At present, ensemble methods fall broadly into two families: the parallel approach, Bagging, and the sequential approach, Boosting. In parallel methods the base learners are built independently of one another, with no required ordering, so they can be trained at the same time. In sequential methods the base learners are strongly dependent: each model is built on top of the one before it. The core idea of ensemble learning is complementary strengths, so the key question is how to increase the independence and diversity of the base learners, and different algorithms adopt different strategies for this.
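To make the contrast concrete, here is a minimal sketch (not from the book) that fits one parallel ensemble (bagged decision trees) and one sequential ensemble (AdaBoost) on synthetic data; the chapter code below tunes these model families properly on the German credit data.

## Minimal Bagging-vs-Boosting sketch on synthetic data (illustrative only)
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
## Bagging: base trees are fitted independently on bootstrap resamples
bagging = BaggingClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=100, random_state=0)
## Boosting: each new learner focuses on the samples the previous ones misclassified
boosting = AdaBoostClassifier(n_estimators=100, random_state=0)
for name, model in [('Bagging', bagging), ('Boosting', boosting)]:
    print(name, 'CV accuracy:', cross_val_score(model, X, y, cv=3).mean())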

Python code implementation and comments

# Chapter 17: Ensemble learning

import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import variable_encode as var_encode
from sklearn.metrics import confusion_matrix,recall_score, auc, roc_curve,precision_score,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.sans-serif']=['SimHei']   
matplotlib.rcParams['axes.unicode_minus']=False  
import warnings
warnings.filterwarnings("ignore") ## suppress warnings
## Read the data
def data_read(data_path,file_name):
    df = pd.read_csv(os.path.join(data_path, file_name), sep=r'\s+', header=None)
    ## Rename the variables
    columns = ['status_account','duration','credit_history','purpose', 'amount',
               'svaing_account', 'present_emp', 'income_rate', 'personal_status',
               'other_debtors', 'residence_info', 'property', 'age',
               'inst_plans', 'housing', 'num_credits',
               'job', 'dependents', 'telephone', 'foreign_worker', 'target']
    df.columns = columns
    ##将标簽變量由狀态1,2轉為0,1;0表示好使用者,1表示壞使用者
    df.target = df.target - 1
    ## Split into data_train and data_test; the training set is used to fit the encoding, which is then applied to the test set under the known rules
    data_train, data_test = train_test_split(df, test_size=0.2, random_state=0,stratify=df.target)
    return data_train, data_test
## Separate categorical and continuous variables
def category_continue_separation(df,feature_names):
    categorical_var = []
    numerical_var = []
    if 'target' in feature_names:
        feature_names.remove('target')
    ## Check the dtype first: int or float columns are treated as continuous
    numerical_var = list(df[feature_names].select_dtypes(include=['int','float','int32','float32','int64','float64']).columns.values)
    categorical_var = [x for x in feature_names if x not in numerical_var]
    return categorical_var,numerical_var
if __name__ == '__main__':
    path = 'D:\\code\\chapter17'
    data_path = os.path.join(path ,'data')
    file_name = 'german.csv'
    ## Read the data
    data_train, data_test = data_read(data_path,file_name)
    ## Check the class balance of the training set
    print('good:', sum(data_train.target == 0), 'bad:', data_train.target.sum())
    ## Separate categorical and continuous variables
    feature_names = list(data_train.columns)
    feature_names.remove('target')
    categorical_var,numerical_var = category_continue_separation(data_train,feature_names)
    
    ### WOE-encode the categorical variables directly
    var_all_bin = list(data_train.columns)
    var_all_bin.remove('target')
    ## WOE encoding on the training set
    df_train_woe, dict_woe_map, dict_iv_values ,var_woe_name = var_encode.woe_encode(data_train,data_path,categorical_var, data_train.target,'dict_woe_map', flag='train')
    ## WOE encoding on the test set, reusing the rules learned on the training set
    df_test_woe, var_woe_name = var_encode.woe_encode(data_test,data_path,categorical_var, data_test.target, 'dict_woe_map',flag='test')
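    ## Note: variable_encode is the helper module shipped with the book's code. WOE encoding
    ## replaces each category with its weight of evidence, i.e. the log ratio of that category's
    ## share of bad customers to its share of good customers (the sign convention varies by
    ## implementation), and dict_iv_values holds each variable's information value (IV).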
    
    ##### Impute missing continuous values (training-set means, applied to both sets)
    for i in numerical_var:
        if sum(data_train[i].isnull()) > 0:
            fill_value = data_train[i].mean()
            data_train[i].fillna(fill_value, inplace=True)
            data_test[i].fillna(fill_value, inplace=True)
            

    ### Assemble the encoded training and test sets
    data_train.reset_index(drop=True,inplace=True)
    data_test.reset_index(drop=True,inplace=True)
    var_1 = numerical_var.copy()  ## copy so numerical_var itself is not mutated
    var_1.append('target')
    data_train_1 = pd.concat([df_train_woe[var_woe_name],data_train[var_1]],axis=1)
    data_test_1 = pd.concat([df_test_woe[var_woe_name],data_test[var_1]],axis=1) 
    
    #### Extract the training and test data
    var_all = list(data_train_1.columns)
    var_all.remove('target')

    #### Standardize the variables (zero mean, unit variance)
    scaler = StandardScaler().fit(data_train_1[var_all])
    data_train_1[var_all] = scaler.transform(data_train_1[var_all])  
    data_test_1[var_all] = scaler.transform(data_test_1[var_all])

    x_train = np.array(data_train_1[var_all])
    y_train = np.array(data_train_1.target)
    
    x_test = np.array(data_test_1[var_all])
    y_test = np.array(data_test_1.target)
        
   
    ######## Random forest model
    ## Hyperparameters to tune
    rf_param = {'n_estimators': list(range(50, 400, 50)),
                'max_depth': list(range(2, 10, 1)),
                'class_weight': [{1: 1, 0: 1}, {1: 2, 0: 1}, {1: 3, 0: 1}]}
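    ## class_weight entries such as {1: 2, 0: 1} up-weight the bad class (label 1),
    ## trading some precision for higher recall on defaulters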
    ## Initialize the grid search
    rf_gsearch = GridSearchCV(estimator=RandomForestClassifier(random_state=0, criterion='entropy',
                                                                max_features=0.8, bootstrap=True),
                              param_grid=rf_param, cv=3, scoring='f1', n_jobs=-1, verbose=2)
    ## Run the hyperparameter search
    rf_gsearch.fit(x_train, y_train)
    print('RandomForest model best_score_ is {0}, and best_params_ is {1}'.format(rf_gsearch.best_score_,
                                                                                  rf_gsearch.best_params_))
    ## Model training
    ## Initialize the random forest with the best parameters found above
    RF_model_2 = RandomForestClassifier(random_state=0, n_jobs=-1, criterion='entropy',
                                        n_estimators=rf_gsearch.best_params_['n_estimators'],
                                        max_depth=rf_gsearch.best_params_['max_depth'],
                                        max_features=0.8,
                                        min_samples_split=50,
                                        class_weight=rf_gsearch.best_params_['class_weight'],
                                        bootstrap=True)
    ## Fit the random forest
    RF_model_2_fit = RF_model_2.fit(x_train, y_train)
    
    ## Fitted attributes (uncomment to inspect)
#    RF_model_2_fit.estimators_
#    RF_model_2_fit.classes_
#    RF_model_2_fit.n_features_in_
#    RF_model_2_fit.feature_importances_
    
    
    ## Model prediction
    y_pred = RF_model_2_fit.predict(x_test)
    ## Confusion matrix, recall, and precision
    cnf_matrix = confusion_matrix(y_test, y_pred)
    recall_value = recall_score(y_test, y_pred)
    precision_value = precision_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print(cnf_matrix)
    print('Validation set: model recall is {0}, precision is {1}, accuracy is {2}'.format(recall_value,
                 precision_value, acc))
    
   
    ## Variable importance ranking
    ## Extract the importance scores
    len_1 = len(RF_model_2.feature_importances_)
    var_name = [str(x).split('_woe')[0] for x in var_all]
    ## Plot the importances
    plt.figure(figsize=(10,6))
    fontsize_1 = 14
    plt.barh(np.arange(len_1), RF_model_2.feature_importances_, color='c', tick_label=var_name)
    plt.xticks( fontsize=fontsize_1)
    plt.yticks( fontsize=fontsize_1)
    plt.show()

    
    ### AdaBoost model
    ## Hyperparameters to tune
    ada_param = {'n_estimators': list(range(50, 500, 50)),
                'learning_rate': list(np.arange(0.1, 1, 0.2))}
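    ## learning_rate shrinks each weak learner's contribution; smaller rates generally
    ## need more estimators, which is why the two are tuned jointly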
    ## Initialize the grid search
    ada_gsearch = GridSearchCV(estimator=AdaBoostClassifier(algorithm='SAMME.R',random_state=0),
                              param_grid=ada_param, cv=3,  n_jobs=-1, verbose=2)
    ## Run the hyperparameter search
    ada_gsearch.fit(x_train, y_train)
    print('AdaBoostClassifier model best_score_ is {0}, and best_params_ is {1}'.format(ada_gsearch.best_score_,
                                                                                        ada_gsearch.best_params_))
    ## Model training
    ## Initialize AdaBoost with the best parameters found above
    ada_model_2 = AdaBoostClassifier( n_estimators=ada_gsearch.best_params_['n_estimators'],
                                        learning_rate=ada_gsearch.best_params_['learning_rate'],
                                        algorithm='SAMME.R',random_state=0)
    ## Fit the AdaBoost model
    ada_model_2_fit = ada_model_2.fit(x_train, y_train)
    
    
    ## Model prediction
    y_pred = ada_model_2_fit.predict(x_test)
    ## Confusion matrix, recall, and precision
    cnf_matrix = confusion_matrix(y_test, y_pred)
    recall_value = recall_score(y_test, y_pred)
    precision_value = precision_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print(cnf_matrix)
    print('Validation set: model recall is {0}, precision is {1}, accuracy is {2}'.format(recall_value,
                 precision_value, acc))
    
    ### Track how accuracy evolves over the boosting rounds
    ## Staged scores: one accuracy value after each boosting iteration
    per_train = list(ada_model_2_fit.staged_score(x_train, y_train))
    per_test = list(ada_model_2_fit.staged_score(x_test, y_test))
    ## Plot the accuracy curves
    plt.figure(figsize=(10,6))
    fontsize_1 = 14
    plt.plot(np.arange(1, len(per_train) + 1), per_train, '--', color='c', label='Train set')
    plt.plot(np.arange(1, len(per_test) + 1), per_test, ':', color='b', label='Test set')
    plt.xticks( fontsize=fontsize_1)
    plt.yticks( fontsize=fontsize_1)
    plt.xlabel('n_estimators',fontsize=fontsize_1)
    plt.ylabel('score',fontsize=fontsize_1)
    plt.legend(fontsize=fontsize_1)
    plt.show()
    
    #### GBDT model
    ## Hyperparameters to tune
    gbdt_param = {'n_estimators': list(range(50, 400, 50)),
                'max_depth': list(range(2, 5, 1)),
                'learning_rate': list(np.arange(0.01, 0.5, 0.05)) }
    ## Initialize the grid search
    gbdt_gsearch = GridSearchCV(estimator=GradientBoostingClassifier( subsample=0.8,max_features=0.8, validation_fraction=0.1, 
                                                                   n_iter_no_change =3,random_state=0),param_grid=gbdt_param, 
                                                                   cv=3, scoring='f1', n_jobs=-1, verbose=2)
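    ## validation_fraction=0.1 with n_iter_no_change=3 enables early stopping: boosting
    ## halts once the score on the internal 10% validation split stops improving for 3 rounds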
    ## Run the hyperparameter search
    gbdt_gsearch.fit(x_train, y_train)
    print('GBDT model best_score_ is {0}, and best_params_ is {1}'.format(gbdt_gsearch.best_score_,
                                                                          gbdt_gsearch.best_params_))
    ## Model training
    ## Initialize the GBDT with the best parameters found above
    GBDT_model= GradientBoostingClassifier(subsample=0.8,max_features=0.8, validation_fraction=0.1, 
                                                      n_iter_no_change =3,random_state=0 ,
                                        n_estimators=gbdt_gsearch.best_params_['n_estimators'],
                                        max_depth=gbdt_gsearch.best_params_['max_depth'],
                                        learning_rate=gbdt_gsearch.best_params_['learning_rate'])
    ## Fit the GBDT model
    GBDT_model_fit = GBDT_model.fit(x_train, y_train)
    
    
    ### Inspect the confusion matrix
    y_pred = GBDT_model_fit.predict(x_test)
    cnf_matrix = confusion_matrix(y_test, y_pred)
    recall_value = recall_score(y_test, y_pred)
    precision_value = precision_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print(cnf_matrix)
    print('Validation set: model recall is {0}, precision is {1}, accuracy is {2}'.format(recall_value,
                 precision_value, acc))
    
    ### XGBoost model
    ## Hyperparameters to tune
    xgb_param = {'max_depth': list(range(2, 6, 1)), 'min_child_weight': list(range(1, 4, 1)),
                 'learning_rate': list(np.arange(0.01, 0.3, 0.05)), 'scale_pos_weight': list(range(1, 5, 1))}
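    ## scale_pos_weight up-weights positive (bad) samples to counter class imbalance;
    ## min_child_weight limits leaf size and acts as regularization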
    ## Initialize the grid search
    xgb_gsearch = GridSearchCV(
        estimator=XGBClassifier(random_state=0, n_estimators=500, subsample=0.8, colsample_bytree=0.8),
        param_grid=xgb_param, cv=3, scoring='f1', n_jobs=-1, verbose=2)
    ## Run the hyperparameter search
    xgb_gsearch.fit(x_train, y_train)
    print('XGBoost model best_score_ is {0}, and best_params_ is {1}'.format(xgb_gsearch.best_score_,
                                                                             xgb_gsearch.best_params_))
    ## Initialize XGBoost with the best parameters found above
    xgboost_model = XGBClassifier(random_state=0, n_jobs=-1,
                                    n_estimators=500,
                                    max_depth=xgb_gsearch.best_params_['max_depth'],
                                    min_child_weight=xgb_gsearch.best_params_['min_child_weight'],
                                    subsample=0.8, colsample_bytree=0.8,
                                    learning_rate=xgb_gsearch.best_params_['learning_rate'],
                                    scale_pos_weight=xgb_gsearch.best_params_['scale_pos_weight'])
    ## Fit the XGBoost model
    xgboost_model_fit = xgboost_model.fit(x_train, y_train)

    
    ## Model prediction
    y_pred = xgboost_model_fit.predict(x_test)
    ## Confusion matrix, recall, and precision
    cnf_matrix = confusion_matrix(y_test, y_pred)
    recall_value = recall_score(y_test, y_pred)
    precision_value = precision_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print(cnf_matrix)
    print('Validation set: model recall is {0}, precision is {1}, accuracy is {2}'.format(recall_value,
                 precision_value, acc))
    
    ## Predicted probabilities for the positive (bad) class
    y_score_test = xgboost_model_fit.predict_proba(x_test)[:, 1]
    ## Compute AR (Gini = 2*AUC - 1), KS, and AUC
    fpr, tpr, thresholds = roc_curve(y_test, y_score_test)
    roc_auc = auc(fpr, tpr)
    ks = max(tpr - fpr)
    ar = 2*roc_auc-1
    print('Test set: model AR is {0}, KS is {1}, AUC is {2}'.format(ar,
                 ks, roc_auc))
    #### KS curve
    plt.figure(figsize=(10,6))
    fontsize_1 = 12
    plt.plot(np.linspace(0, 1, len(tpr)), tpr, '--', color='black', label='TPR')
    plt.plot(np.linspace(0, 1, len(tpr)), fpr, ':', color='black', label='FPR')
    plt.plot(np.linspace(0, 1, len(tpr)), tpr - fpr, '-', color='grey', label='KS')
    plt.grid()
    plt.xticks(fontsize=fontsize_1)
    plt.yticks(fontsize=fontsize_1)
    plt.xlabel('Probability bucket', fontsize=fontsize_1)
    plt.ylabel('Cumulative proportion (%)', fontsize=fontsize_1)
    plt.legend(fontsize=fontsize_1)
    plt.show()
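
With all four models fitted, it can help to compare them side by side. The lines below are a minimal sketch (not in the book) that could be appended to the end of the script above; they reuse the fitted model objects and report each one's test-set AUC:

    ## Compare the test-set AUC of the four fitted ensemble models (illustrative addition)
    from sklearn.metrics import roc_auc_score
    models = {'RandomForest': RF_model_2_fit, 'AdaBoost': ada_model_2_fit,
              'GBDT': GBDT_model_fit, 'XGBoost': xgboost_model_fit}
    for name, model in models.items():
        auc_value = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
        print('{0}: test set AUC = {1:.4f}'.format(name, auc_value))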