
Machine Learning Project in Practice: Customer Churn Early Warning

from __future__ import division
import pandas as pd
import numpy as np

churn_df = pd.read_csv("D:\\test\\machineLearning\\churn.csv")
col_names = churn_df.columns.tolist()

print "Column_names:"
print col_names

to_show = col_names[:6] + col_names[-6:]  # first six and last six columns
print "\nSample_data:"
churn_df[to_show].head()
           

Column_names:
['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']

Sample_data:

  State  Account Length  Area Code     Phone Int'l Plan VMail Plan  Night Charge  Intl Mins  Intl Calls  Intl Charge  CustServ Calls Churn?
0    KS             128        415  382-4657         no        yes         11.01       10.0           3         2.70               1 False.
1    OH             107        415  371-7191         no        yes         11.45       13.7           3         3.70               1 False.
2    NJ             137        415  358-1921         no         no          7.32       12.2           5         3.29               0 False.
# Convert the string labels to numeric values for analysis
# "Churn" means customer attrition
churn_result = churn_df["Churn?"]
y = np.where(churn_result == 'True.',1,0)

# Drop features that are not useful for prediction
to_drop = ['State','Area Code','Phone','Churn?']
churn_feat_space = churn_df.drop(to_drop,axis=1)

# Convert the yes/no columns to boolean values
yes_no_cols = ["Int'l Plan","VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols] == 'yes'

features = churn_feat_space.columns

# Pull out the feature matrix as floats (in newer pandas, .values replaces as_matrix())
X = churn_feat_space.as_matrix().astype(np.float)

# Key point: different features live on very different numeric scales (e.g. 1-2 vs. 30,000-40,000),
# and those large gaps between features would distort the analysis (plots included),
# so we standardize all features onto a common scale
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

print "Feature space holds %d observations and %d features" % X.shape
print "Unique target labels:",np.unique(y)
print X[0]
print len(y[y == 0])
           

Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]
[ 0.67648946 -0.32758048  1.6170861   1.23488274  1.56676695  0.47664315
  1.56703625 -0.07060962 -0.05594035 -0.07042665  0.86674322 -0.46549436
  0.86602851 -0.08500823 -0.60119509 -0.0856905  -0.42793202]
2850
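As a quick sanity check on the claim above (a minimal sketch, not part of the original walkthrough): after StandardScaler, each column of X should have mean close to 0 and standard deviation close to 1.

# Hypothetical check: StandardScaler output should be ~zero mean, ~unit variance per column
print "Column means (expect ~0):", np.round(X.mean(axis=0), 6)
print "Column stds  (expect ~1):", np.round(X.std(axis=0), 6)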

from sklearn.cross_validation import KFold

# Cross-validation helper: X is the feature matrix, y the labels,
# clf_class the classifier class you choose, kwargs its keyword parameters
def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)  # 5 folds (fold count omitted in the original; 5 is a common choice)
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
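A side note on the import path, since sklearn.cross_validation has since been removed: from scikit-learn 0.18 on, KFold lives in sklearn.model_selection, takes n_splits, and is iterated via kf.split(X). A minimal sketch of the same fold loop under the newer module (aliased here to avoid clashing with the import above):

# Equivalent fold loop with the newer sklearn.model_selection API (scikit-learn >= 0.18)
from sklearn.model_selection import KFold as ModelSelectionKFold
kf = ModelSelectionKFold(n_splits=5, shuffle=True)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]  # same indexing as in run_cv above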
           
from sklearn.svm import SVC  # support vector machine
from sklearn.ensemble import RandomForestClassifier as RF  # random forest
from sklearn.neighbors import KNeighborsClassifier as KNN  # k-nearest neighbors

def accuracy(y_true,y_pred):
    # NumPy interprets True and False as 1. and 0.
    return np.mean(y_true == y_pred)

# Try several classifiers and compare their results
print "Support vector machines:"
print "%.3f" % accuracy(y, run_cv(X,y,SVC))
print "Random forest:"
print "%.3f" % accuracy(y, run_cv(X,y,RF))
print "K-nearest-neighbors:"
print "%.3f" % accuracy(y, run_cv(X,y,KNN))
           

Support vector machines:
0.913
Random forest:
0.942
K-nearest-neighbors:
0.897

# Raw accuracy by itself is of limited value here. For the business, the key error is the
# false negative (FN): we predicted the customer would not churn, but they actually left
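To put a number on those false negatives, one hedged sketch (not from the original post) is to feed the cross-validated predictions into a confusion matrix; with sklearn.metrics.confusion_matrix, rows are actual classes and columns are predicted classes, so the FN count sits at row 1, column 0:

from sklearn.metrics import confusion_matrix

# Cross-validated hard predictions from the random forest defined above
y_pred = run_cv(X,y,RF)
cm = confusion_matrix(y,y_pred)
print "Confusion matrix (rows = actual, cols = predicted):"
print cm
print "False negatives (churners we predicted as staying):", cm[1,0]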

from sklearn.cross_validation import KFold

# Same cross-validation scheme as run_cv, but returns each sample's predicted
# class probabilities instead of hard labels
def run_prob_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)  # 5 folds (fold count omitted in the original; 5 is a common choice)
    y_prob = np.zeros((len(y),2))  # two columns: P(no churn), P(churn)

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
    return y_prob
           
import warnings
warnings.filterwarnings('ignore')

pred_prob = run_prob_cv(X,y,RF,n_estimators=10)  # tree count omitted in the original; 10 used here as an illustrative value

pred_churn = pred_prob[:,1]  # predicted probability of churn (class 1)
is_churn = y == 1

counts = pd.value_counts(pred_churn)  # number of customers in each predicted-probability bin

# For each predicted-probability bin, compute the actual churn rate
true_prob = {}
for prob in counts.index:
    true_prob[prob] = np.mean(is_churn[pred_churn == prob])
true_prob = pd.Series(true_prob)

counts = pd.concat([counts,true_prob],axis=1).reset_index()
counts.columns = ["pred_prob","count","true_prob"]
counts
# Early warning from the table below: e.g. when the predicted probability is 0.3 or 0.4,
# check the observed churn rate at that level, and let the business pick the alert threshold
           
    pred_prob  count  true_prob
0         0.0   1779   0.029230
1         0.1    696   0.020115
2         0.2    265   0.060377
3         0.3    126   0.142857
4         0.8     91   0.978022
5         0.9     75   0.960000
6         0.4     73   0.438356
7         0.7     65   0.953846
8         0.5     57   0.561404
9         1.0     56   0.982143
10        0.6     50   0.820000
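One way to turn this table into an alert, sketched here with an illustrative threshold of 0.3 (the original leaves the choice to the business):

# Hypothetical alerting rule: flag customers at or above a chosen churn-probability threshold
threshold = 0.3  # illustrative value; tune it against the true_prob column above
at_risk = np.where(pred_churn >= threshold)[0]
print "Customers flagged for churn warning:", len(at_risk)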
