XGBoost is one of the more popular models on Kaggle. Like GBM and AdaBoost, it can be used for binary or multi-class classification, improving accuracy by combining many weak classifiers into a strong one, and it supports multithreading. After finishing basic feature engineering, we use XGBoost to train a model and make predictions; there is no parameter tuning yet:
# coding=gbk
'''
Created on 2016-08-23
@author: oul
'''
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
from operator import itemgetter
import zipfile
from sklearn.metrics import roc_auc_score
import time
random.seed(3)
training_filename = 'C:\\Users\\ganda\\Desktop\\train.csv'
testing_filename = 'C:\\Users\\ganda\\Desktop\\test.csv'
submission_dir = 'C:\\Users\\ganda\\Desktop\\'
def load_train():
    print("Loading training data csv: " + training_filename)
    training_data = pd.read_csv(training_filename)
    return training_data
def load_test():
    print("Loading testing data csv: " + testing_filename)
    test = pd.read_csv(testing_filename)
    # TODO
    return test
def create_feature_map(features):
    # Write the feature map file XGBoost uses to name features in get_fscore()
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()
def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance
def print_features_importance(imp):
    for i in range(len(imp)):
        print("# " + str(imp[i][1]))
        print('output.remove(\'' + imp[i][0] + '\')')
def run_XGBoost(train, test, features, target, random_state=0):
    eta = 0.1
    max_depth = 5
    subsample = 0.8
    colsample_bytree = 0.8
    start_time = time.time()
    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster": "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state
    }
    num_boost_round = 260
    early_stopping_rounds = 20
    test_size = 0.1
    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    y_train = X_train[target]
    y_valid = X_valid[target]
    # Mark missing values as -9999 so XGBoost handles them natively
    dtrain = xgb.DMatrix(X_train[features], y_train, missing=-9999)
    dvalid = xgb.DMatrix(X_valid[features], y_valid, missing=-9999)
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_ntree_limit)
    score = roc_auc_score(y_valid.values, check)
    print('Validation AUC: {:.6f}'.format(score))
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)
    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_ntree_limit)
    print('Training time: {} minutes'.format(round((time.time() - start_time) / 60, 2)))
    return test_prediction.tolist(), score
def create_submission(score, test, prediction):
    # Make submission
    now = datetime.datetime.now()
    sub_file = submission_dir + 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('id,probability\n')
    total = 0
    for id in test['id']:
        str1 = str(id) + ',' + str(prediction[total]) + '\n'
        total += 1
        f.write(str1)
    f.close()
train = load_train()
test = load_test()
features = []
# TODO: add feature column names here
features.extend([''])
print('Length of train: ', len(train))
print('Length of test: ', len(test))
print('Features [{}]: {}'.format(len(features), sorted(features)))
# 'label' is the target column
test_prediction, score = run_XGBoost(train, test, features, 'label')
print('Score = {}'.format(score))
create_submission(score, test, test_prediction)
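
Since the script does no parameter tuning yet, a natural next step is to let XGBoost's built-in cross validation pick num_boost_round before searching over eta, max_depth, subsample and colsample_bytree. Below is a minimal sketch meant to replace the hard-coded num_boost_round = 260 inside run_XGBoost; the fold count, round cap and the reuse of params/dtrain are illustrative assumptions, not values from the original script:

# Tuning sketch (assumed values): cross-validate to pick the number of
# boosting rounds by AUC instead of hard-coding 260.
cv_result = xgb.cv(params, dtrain,
                   num_boost_round=500,   # generous upper bound on rounds
                   nfold=5,               # assumed fold count
                   metrics='auc',
                   early_stopping_rounds=20,
                   seed=random_state)
# xgb.cv returns one row per surviving round, so the row count is the
# early-stopped best number of rounds.
num_boost_round = len(cv_result)
print('CV-selected num_boost_round: {}'.format(num_boost_round))

A fuller grid search over eta and max_depth would simply wrap this call in a loop over candidate values and keep the combination with the best mean test AUC.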