
Machine Learning Project in Practice: Customer Churn Early Warning

from __future__ import division
import pandas as pd
import numpy as np

churn_df = pd.read_csv("D:\\test\\machineLearning\\churn.csv")
col_names = churn_df.columns.tolist()

print "Column_names:"
print col_names

to_show = col_names[:6] + col_names[-6:]  # first six and last six columns
print "\nSample_data:"
churn_df[to_show].head()
           

Column_names:
['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']

Sample_data:

  State  Account Length  Area Code     Phone Int'l Plan VMail Plan  Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls Churn?
0    KS             128        415  382-4657         no        yes         11.01       10.0           3         2.70               1 False.
1    OH             107        415  371-7191         no        yes         11.45       13.7           3         3.70               1 False.
2    NJ             137        415  358-1921         no         no          7.32       12.2           5         3.29               0 False.
# Convert the string labels to numeric values for analysis
# "Churn" means customer attrition
churn_result = churn_df["Churn?"]
y = np.where(churn_result == 'True.',1,0)

# Drop features that are not useful for prediction
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(to_drop,axis=1)

# Convert the yes/no columns to boolean values
yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

features = churn_feat_space.columns

# Pull out the feature matrix as floats (in newer pandas, .values replaces as_matrix())
X = churn_feat_space.as_matrix().astype(np.float)

# Key point: different features live on very different numeric scales (e.g. 1-2 vs. 30,000-40,000),
# and those large gaps between features would distort the analysis (plots included),
# so we standardize all features onto a common scale
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

print "Feature space holds %d observations and %d features" % X.shape
print "Unique target labels:",np.unique(y)
print X[0]
print len(y[y == 0])
           

Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]
[ 0.67648946 -0.32758048  1.6170861   1.23488274  1.56676695  0.47664315
  1.56703625 -0.07060962 -0.05594035 -0.07042665  0.86674322 -0.46549436
  0.86602851 -0.08500823 -0.60119509 -0.0856905  -0.42793202]
2850
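As a quick sanity check on the claim above (a minimal sketch, not part of the original walkthrough): after StandardScaler, each column of X should have mean close to 0 and standard deviation close to 1.

# Hypothetical check: StandardScaler output should be ~zero mean, ~unit variance per column
print "Column means (expect ~0):", np.round(X.mean(axis=0), 6)
print "Column stds  (expect ~1):", np.round(X.std(axis=0), 6)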

from sklearn.cross_validation import KFold

# Cross-validation helper: X is the feature matrix, y the labels,
# clf_class the classifier class you choose, kwargs its keyword parameters
def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)  # 5 folds (fold count omitted in the original; 5 is a common choice)
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
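A side note on the import path, since sklearn.cross_validation has since been removed: from scikit-learn 0.18 on, KFold lives in sklearn.model_selection, takes n_splits, and is iterated via kf.split(X). A minimal sketch of the same fold loop under the newer module (aliased here to avoid clashing with the import above):

# Equivalent fold loop with the newer sklearn.model_selection API (scikit-learn >= 0.18)
from sklearn.model_selection import KFold as ModelSelectionKFold
kf = ModelSelectionKFold(n_splits=5, shuffle=True)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]  # same indexing as in run_cv above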
           
from sklearn.svm import SVC  # support vector machine
from sklearn.ensemble import RandomForestClassifier as RF  # random forest
from sklearn.neighbors import KNeighborsClassifier as KNN  # k-nearest neighbors

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

# Try several classifiers and compare their results
print "Support vector machines:"
print "%.3f" % accuracy(y, run_cv(X,y,SVC))
print "Random forest:"
print "%.3f" % accuracy(y, run_cv(X,y,RF))
print "K-nearest-neighbors:"
print "%.3f" % accuracy(y, run_cv(X,y,KNN))
           

Support vector machines:
0.913
Random forest:
0.942
K-nearest-neighbors:
0.897

# Raw accuracy by itself is of limited value here. For the business, the key error is the
# false negative (FN): we predicted the customer would not churn, but they actually left
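To put a number on those false negatives, one hedged sketch (not from the original post) is to feed the cross-validated predictions into a confusion matrix; with sklearn.metrics.confusion_matrix, rows are actual classes and columns are predicted classes, so the FN count sits at row 1, column 0:

from sklearn.metrics import confusion_matrix

# Cross-validated hard predictions from the random forest defined above
y_pred = run_cv(X,y,RF)
cm = confusion_matrix(y,y_pred)
print "Confusion matrix (rows = actual, cols = predicted):"
print cm
print "False negatives (churners we predicted as staying):", cm[1,0]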

from sklearn.cross_validation import KFold

# Same cross-validation scheme as run_cv, but returns each sample's predicted
# class probabilities instead of hard labels
def run_prob_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)  # 5 folds (fold count omitted in the original; 5 is a common choice)
    y_prob = np.zeros((len(y),2))  # two columns: P(no churn), P(churn)

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
           
import warnings
warnings.filterwarnings('ignore')

pred_prob = run_prob_cv(X,y,RF,n_estimators=10)  # tree count omitted in the original; 10 used here as an illustrative value

pred_churn = pred_prob[:,1]  # predicted probability of churn (class 1)
is_churn = y == 1

counts = pd.value_counts(pred_churn)  # number of customers in each predicted-probability bin

# For each predicted-probability bin, compute the actual churn rate
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)

counts = pd.concat([counts,true_prob],axis=1).reset_index()
counts.columns = ["pred_prob","count","true_prob"]
counts
# Early warning from the table below: e.g. when the predicted probability is 0.3 or 0.4,
# check the observed churn rate at that level, and let the business pick the alert threshold
           
    pred_prob  count  true_prob
0         0.0   1779   0.029230
1         0.1    696   0.020115
2         0.2    265   0.060377
3         0.3    126   0.142857
4         0.8     91   0.978022
5         0.9     75   0.960000
6         0.4     73   0.438356
7         0.7     65   0.953846
8         0.5     57   0.561404
9         1.0     56   0.982143
10        0.6     50   0.820000
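One way to turn this table into an alert, sketched here with an illustrative threshold of 0.3 (the original leaves the choice to the business):

# Hypothetical alerting rule: flag customers at or above a chosen churn-probability threshold
threshold = 0.3  # illustrative value; tune it against the true_prob column above
at_risk = np.where(pred_churn >= threshold)[0]
print "Customers flagged for churn warning:", len(at_risk)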
