
XGBoost Multi-class Prediction in Python

import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, precision_score, recall_score
import joblib  # used to persist the trained model (sklearn.externals.joblib was removed from newer scikit-learn)

def get_cust_age_stage(birth_year):
    """Map birth years to age brackets."""
    age_stage = []
    for i in range(len(birth_year)):
        if int(birth_year[i]) == 0:
            age_stage.append("未知")
        elif int(birth_year[i]) < 1960:
            age_stage.append("60前")
        elif int(birth_year[i]) < 1970:
            age_stage.append("60後")
        elif int(birth_year[i]) < 1980:
            age_stage.append("70後")
        elif int(birth_year[i]) < 1990:
            age_stage.append("80後")
        elif int(birth_year[i]) < 2000:
            age_stage.append("90後")
        elif int(birth_year[i]) >= 2000:
            age_stage.append("00後")
        else:
            age_stage.append("未知")
    return age_stage

def get_top5_onehot(data):
    """One-hot encode only the five most frequent values of column 'c'."""
    # get the five most frequent values of 'c'
    c_top5_counts = data['c'].value_counts()[:5]
    c_top5_names = list(c_top5_counts.keys())
    # one-hot encode and keep only the top-5 columns
    c_one_hot = pd.get_dummies(data['c'])
    c_top5 = c_one_hot[c_top5_names]
    # join the top-5 indicator columns back onto data
    data = data.join(c_top5)
    return data
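# --- Illustrative usage sketch (hypothetical toy data, not part of the original pipeline) ---
# Only the five most frequent values of 'c' become indicator columns; rarer values such as
# 'u' below simply get 0 in every indicator column.
toy = pd.DataFrame({'c': ['x'] * 6 + ['y'] * 5 + ['z'] * 4 + ['w'] * 3 + ['v'] * 2 + ['u']})
print(get_top5_onehot(toy))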

def get_quantile_20_values(input_data):
    """Compute the cut points that split the data into 20 equal-frequency bins."""
    grade = pd.DataFrame(columns=['quantile', 'value'])
    for i in range(0, 21):
        grade.loc[i, 'quantile'] = i / 20.0
        grade.loc[i, 'value'] = input_data.quantile(i / 20.0)
    cut_point = grade['value'].tolist()  # the 21 quantile values (0%, 5%, ..., 100%)
    # de-duplicate the quantile values
    s_unique = []
    for i in range(len(cut_point)):
        if cut_point[i] not in s_unique:
            s_unique.append(cut_point[i])
    return s_unique

def get_quantile_interregional(s_unique):
    """Build intervals from the de-duplicated quantile values."""
    interregional = []
    for i in range(1, len(s_unique)):
        interregional.append([i, s_unique[i - 1], s_unique[i]])
        if i == len(s_unique) - 1 and len(interregional) < 20:
            interregional.append([i + 1, s_unique[i], s_unique[i]])
    return interregional

def get_current_level(item_data, interregional):
    """Look up the level of a value according to the quantile intervals."""
    level = 0
    for i in range(len(interregional)):
        # intervals are left-closed, right-open: [lower, upper)
        if item_data >= interregional[i][1] and item_data < interregional[i][2]:
            level = interregional[i][0]
            break
        elif interregional[i][1] == interregional[i][2]:
            level = interregional[i][0]
            break
    return level

def get_division_level(input_data):
    """Assign each value a level based on its quantile bin."""
    # get the de-duplicated 20-quantile cut points
    s_unique = get_quantile_20_values(input_data)
    # build the quantile intervals, each in the form [index, lower, upper); left-closed, right-open
    interregional = get_quantile_interregional(s_unique)
    # map each value to its level according to the intervals
    quantile_20_level = []
    for item in input_data:
        quantile_20_level.append(get_current_level(item, interregional))
    return quantile_20_level
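# --- Illustrative usage sketch (hypothetical toy Series, not part of the original pipeline) ---
# get_division_level maps each value to the index of its quantile bin (roughly 1..20).
demo_series = pd.Series(range(1, 101))
demo_levels = get_division_level(demo_series)
print(demo_levels[:5], demo_levels[-5:])
# Caveat: because the intervals are left-closed/right-open and the degenerate [max, max]
# interval is only appended when duplicate quantiles shrink the interval list, the overall
# maximum (here 100) falls outside every interval and receives level 0.
# pd.qcut(demo_series, 20, labels=False, duplicates='drop') is a one-line pandas alternative.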

def pre_processing(data):
    """Pre-process the raw data."""
    # 1. add derived variables
    # age bracket
    data['年齡'] = get_cust_age_stage(data['出生年份'])
    # average duration this month
    data['本月平均時長'] = data['本月時長'].div(data['本月次數'], axis=0)
    data['g'] = data['a'] - data['b']
    # 2. fill missing values
    col_name_0 = ['a', 'b', 'g', 'k']  # columns whose missing values are filled with the number 0
    values = {}
    for i in col_name_0:
        values[i] = 0
    # without inplace=True the missing values would not actually be filled
    data.fillna(value=values, inplace=True)
    data.fillna({'m': '未知', 'z': '未知'}, inplace=True)  # columns m and z are filled with a string
    # one-hot encode column c
    data = get_top5_onehot(data)
    # 3. discretize into levels
    col_name_level = ['d', 'e', 'f']
    for i in range(len(col_name_level)):
        new_col_name = col_name_level[i] + "_TILE20"
        data[new_col_name] = get_division_level(data[col_name_level[i]])
    return data

def get_model_columns(input_data):
    """Return the list of column names used for modelling."""
    total_col_names = input_data.columns
    del_col_names = ['a', 'b', 'c']
    model_col_names = [i for i in total_col_names if i not in del_col_names]
    return model_col_names

def importance_features_top(model_str, model, x_train):
    """Print the model's ten most important features."""
    print("Top feature importances for {}".format(model_str))
    feature_importances_ = model.feature_importances_
    feature_names = x_train.columns
    importance_col = pd.DataFrame([*zip(feature_names, feature_importances_)],
                                  columns=['a', 'b'])  # 'a' = feature name, 'b' = importance
    importance_col_desc = importance_col.sort_values(by='b', ascending=False)
    print(importance_col_desc.iloc[:10, :])

def print_precison_recall_f1(y_true, y_pre):
    """Print precision, recall and F1 score."""
    print("Precision, recall and F1 on the given data")
    print(classification_report(y_true, y_pre))
    f1 = round(f1_score(y_true, y_pre, average='macro'), 2)
    p = round(precision_score(y_true, y_pre, average='macro'), 2)
    r = round(recall_score(y_true, y_pre, average='macro'), 2)
    print("Precision: {}, Recall: {}, F1: {}".format(p, r, f1))

def xgboost_model(x_train, y_train):
    """Train an XGBoost classifier and return the fitted model."""
    xgboost_clf = XGBClassifier(min_child_weight=6, max_depth=15,
                                objective='multi:softmax', num_class=5)
    print("-" * 60)
    print("xgboost model:", xgboost_clf)
    xgboost_clf.fit(x_train, y_train)
    # print the feature importances
    importance_features_top('xgboost', xgboost_clf, x_train)
    # persist the trained model
    joblib.dump(xgboost_clf, './model/XGBoost_model_v1.0')
    return xgboost_clf
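# Hedged note: recent xgboost releases expect the target of a multi:softmax model to already be
# integer class codes 0..num_class-1; if the 'label' column holds strings, encode it first, e.g.
# (hypothetical sketch, only needed on such versions):
#   from sklearn.preprocessing import LabelEncoder
#   label_encoder = LabelEncoder()
#   y_encoded = label_encoder.fit_transform(data_y)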

filename = "./檔案對應路徑.xlsx"
data = pd.read_excel(filename)
# pre-processing: fill missing values, add derived variables, discretize into levels, one-hot the top values
data_processed = pre_processing(data)
# drop variables excluded for business reasons and keep the modelling features
model_col_names = get_model_columns(data_processed)
model_data = data_processed[model_col_names]
# split the data into features and label
data_y = model_data['label']
data_x = model_data.drop(['label'], axis=1)
# split into training and test sets; a fixed random seed makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y,
                                                    test_size=0.3, random_state=1)
# train the model
xgboost_clf = xgboost_model(x_train, y_train)
# predict on the test set
pre_y_test = xgboost_clf.predict(x_test)
# print the test-set results, including precision, recall and f1-score
print("-" * 30, "Test set", "-" * 30)
print_precison_recall_f1(y_test, pre_y_test)
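# --- Hedged sketch: reloading the persisted model (assumes the ./model/ directory exists, as the
# joblib.dump call above already does). The restored classifier can score any data that has gone
# through the same pre_processing and column selection as the training data.
loaded_clf = joblib.load('./model/XGBoost_model_v1.0')
pre_y_test_reloaded = loaded_clf.predict(x_test)
print_precison_recall_f1(y_test, pre_y_test_reloaded)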