作者:魚佬,武漢大學碩士
2022科大訊飛:電信客戶流失預測挑戰賽
賽題介紹
隨著市場飽和度的上升,電信營運商之間的競爭也越來越激烈,營運商亟需解決如何減少使用者流失、延長使用者生命週期的問題。就客戶流失率而言,每增加5%,利潤就可能隨之降低25%-85%。因此,針對電信使用者流失的分析與預測至關重要。
鑒於此,營運商通常會設有客戶服務部門,該部門的主要職能是做好客戶流失分析,贏回高機率流失的客戶,降低客戶流失率。某電信機構的客戶存在大量流失情況,導致該機構的使用者量急速下降。面對如此頭疼的問題,該機構將部分客戶資料開放,誠邀大家幫助他們建立流失預測模型來預測可能流失的客戶。
賽題任務
給定某電信機構實際業務中的相關客戶資訊,包含69個與客戶相關的字段,其中“是否流失”字段表明客戶是否會在觀察日期後的兩個月內流失。任務目標是通過訓練集訓練模型,來預測客戶是否會流失,並以此為依據開展工作,提高使用者留存。
賽題資料
賽題資料由訓練集和測試集組成,總資料量超過25萬條,包含69個特徵字段。為了保證比賽的公平性,將會從中抽取15萬條作為訓練集,3萬條作為測試集,同時會對部分字段資訊進行脫敏。
特征字段
客戶ID、地理區域、是否雙頻、是否翻新機、目前手機價格、手機網絡功能、婚姻狀況、家庭成人人數、資訊庫比對、預計收入、信用卡訓示器、目前裝置使用天數、在職總月數、家庭中唯一訂閱者的數量、家庭活躍使用者數、....... 、過去六個月的平均每月使用分鐘數、過去六個月的平均每月通話次數、過去六個月的平均月費用、是否流失
評分標準
from sklearn import metrics
# Scoring snippet: ROC AUC between the true-label column and the
# predicted-score column of `data`.
auc = metrics.roc_auc_score(
    y_true=data['default_score_true'],
    y_score=data['default_score_pred'],
)
賽題baseline
導入模組
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
from gensim.models import Word2Vec
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
資料預處理
# Load both CSV files, then stack them row-wise into one frame with a
# fresh 0..n-1 index so later column handling is applied to both at once.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
data = pd.concat([train, test], axis=0, ignore_index=True)
訓練資料/測試資料準備
# Every column except the target and the customer id is used as a feature.
non_feature_columns = ['是否流失', '客戶ID']
features = [column for column in data.columns if column not in non_feature_columns]

# Rows with a target value form the training set; rows without one form
# the test set. Both get a fresh positional index.
has_label = data['是否流失'].notnull()
train = data[has_label].reset_index(drop=True)
test = data[~has_label].reset_index(drop=True)

x_train, x_test = train[features], test[features]
y_train = train['是否流失']
建構模型
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train one boosting backend with 5-fold CV and return its predictions.

    For every fold a model is fitted on the training part, evaluated with
    ROC AUC on the held-out part, and used to score ``test_x``. The per-fold
    test predictions are averaged over all folds.

    Args:
        clf: Backend handle selected by ``clf_name``: an object exposing
            ``Dataset`` and ``train`` for "lgb", an object exposing
            ``DMatrix`` and ``train`` for "xgb", or an estimator class that
            is instantiated and fitted for "cat".
        train_x: Training feature DataFrame (rows are selected via ``.iloc``).
        train_y: Training labels aligned row-by-row with ``train_x``.
        test_x: Feature DataFrame scored by every fold model.
        clf_name: One of "lgb", "xgb" or "cat".

    Returns:
        A ``(oof, test_avg)`` tuple of 1-D numpy arrays: the out-of-fold
        predictions for every row of ``train_x`` and the fold-averaged
        predictions for every row of ``test_x``.

    Raises:
        ValueError: If ``clf_name`` is not a supported backend.
    """
    # Fail fast instead of hitting an undefined `val_pred` inside the loop.
    if clf_name not in ("lgb", "xgb", "cat"):
        raise ValueError("unsupported clf_name: %r" % (clf_name,))

    folds = 5
    seed = 2022
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positional indices; a plain array keeps label selection
    # positional even when `train_y` does not carry a default 0..n-1 index.
    labels = np.asarray(train_y)
    # Use the columns of the frame actually passed in, not a module global.
    feature_names = list(train_x.columns)

    oof = np.zeros(train_x.shape[0])
    test_avg = np.zeros(test_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y = train_x.iloc[train_index], labels[train_index]
        val_x, val_y = train_x.iloc[valid_index], labels[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.7,
                'bagging_fraction': 0.7,
                'bagging_freq': 10,
                'learning_rate': 0.2,
                'seed': 2022,
                'n_jobs': -1,
            }

            # NOTE(review): `verbose_eval` / `early_stopping_rounds` are
            # keyword arguments of older LightGBM releases; newer releases
            # expect callbacks instead - confirm the installed version.
            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[], verbose_eval=3000, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

            # Top-20 features of this fold ranked by gain.
            print(list(sorted(zip(feature_names, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])

        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)

            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.2,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }

            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

            # NOTE(review): `ntree_limit` / `best_ntree_limit` belong to
            # older XGBoost releases - confirm the installed version.
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=3000, early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

        else:  # clf_name == "cat"
            params = {'learning_rate': 0.2, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}

            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=3000)

            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        oof[valid_index] = val_pred
        # Accumulate so the result is the mean over all folds; plain
        # assignment would keep only the last fold's scaled prediction.
        test_avg += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof, test_avg
def lgb_model(x_train, y_train, x_test):
    """Run `cv_model` with the LightGBM backend.

    Returns the pair produced by `cv_model` for the "lgb" backend.
    """
    return cv_model(lgb, x_train, y_train, x_test, "lgb")
def xgb_model(x_train, y_train, x_test):
    """Run `cv_model` with the XGBoost backend.

    Returns the pair produced by `cv_model` for the "xgb" backend.
    """
    return cv_model(xgb, x_train, y_train, x_test, "xgb")
def cat_model(x_train, y_train, x_test):
    """Run `cv_model` with the CatBoost regressor class as backend.

    Returns the pair produced by `cv_model` for the "cat" backend.
    """
    return cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
# Fit the LightGBM baseline; the second returned value holds the test-set
# predictions used for the submission.
lgb_train, lgb_test = lgb_model(x_train=x_train, y_train=y_train, x_test=x_test)
送出結果
# Store the LightGBM test predictions in the target column, then write the
# id/target pair to the submission CSV without the row index.
test['是否流失'] = lgb_test
submission_columns = ['客戶ID', '是否流失']
test[submission_columns].to_csv('test_sub.csv', index=False)