天天看點

推薦模型LightGBM

import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
import time
import datetime

from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
import re
from keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import tensorflow.keras.backend as K
from keras.optimizers import *
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.backend import cast
import tensorflow as tf
import random as rn
import gc
import logging
import gensim
np.random.seed(1024)
rn.seed(1024)

import warnings
warnings.filterwarnings('ignore')
           
/home/frank/miniconda3/envs/reco2/lib/python3.7/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
           
user_log_acct item_sku_id action_time action_type brand_code shop_id item_third_cate_cd vender_id shop_score age sex user_level province city county
0 937922 357022 2020-02-04 08:28:15 1 1791.0 8703.0 10.0 5227.0 -1.000000 5.0 1.0 5 11.0 348.0 1782.0
1 937922 73 2020-02-04 08:27:07 1 1791.0 8703.0 10.0 5227.0 -1.000000 5.0 1.0 5 11.0 348.0 1782.0
2 937922 29583 2020-02-04 08:26:31 1 1791.0 2738.0 10.0 3436.0 9.206167 5.0 1.0 5 11.0 348.0 1782.0
3 937922 108763 2020-02-04 08:26:10 1 1791.0 2738.0 10.0 3436.0 9.206167 5.0 1.0 5 11.0 348.0 1782.0
4 1369473 331139 2020-02-03 21:55:49 1 9985.0 6367.0 73.0 3666.0 0.000000 5.0 1.0 5 1.0 41.0 2058.0
# Display the (rows, columns) shape of the action_data frame.
action_data.shape
           
(37214269, 15)
           

資料預處理

# Some action_time values are malformed and need fixing: keep only the first
# 19 characters ("YYYY-MM-DD HH:MM:SS") so that every value parses below.
# Vectorized .str slicing replaces the per-row lambda; the temporary dd_len
# length column was computed and deleted without being used, so it is gone.
action_data['action_time'] = action_data['action_time'].str[:19]

action_data['action_time'] = pd.to_datetime(action_data['action_time'])
action_data = action_data.sort_values('action_time')

# Calendar helpers; month_day encodes the date as month * 100 + day (April 6 -> 406).
action_data['month'] = action_data['action_time'].dt.month
action_data['day'] = action_data['action_time'].dt.day
action_data['month_day'] = action_data['month'].values * 100 + action_data['day'].values
           

訓練集切分

def _label_trans(x, dic_):
    try:
        return dic_[x]
    except:
        return 0
           
def get_label(df, label_st=(4, 11), label_en=(4, 15), candidate_st=(4, 6), candidate_en=(4, 10), fea_en=(4, 10)):
    """Split an action log into labelled candidate pairs and a feature window.

    Every boundary is a ``(month, day)`` tuple and is inclusive.

    Args:
        df: Action log with at least the columns ``user_log_acct``,
            ``item_sku_id``, ``action_type``, ``month``, ``day`` and
            ``month_day``.
        label_st, label_en: Label window. Only rows with ``action_type == 2``
            inside it count as positives (presumably orders - confirm
            against the data dictionary).
        candidate_st, candidate_en: Window whose distinct (user, item) pairs
            form the candidate set.
        fea_en: Last day whose rows may be used to build features.

    Returns:
        ``(df_candidates, data_fea)``. ``df_candidates`` has one row per
        distinct (user, item) pair with the columns ``user_log_acct``,
        ``item_sku_id``, ``label_cnt`` (number of label rows for the pair),
        ``label_days`` (number of distinct ``day`` values among them) and
        ``user_item`` (``"<user>_<item>"`` key). ``data_fea`` is a copy of
        all rows of ``df`` with ``month_day <= fea_en``.
    """

    def _month_day(month_and_day):
        # Read the encoded value from the frame when that calendar day has
        # rows. The old `.values[0]` lookup raised IndexError for a day with
        # no activity, so fall back to the month * 100 + day encoding.
        # NOTE(review): the fallback assumes month_day was built that way.
        month, day = month_and_day
        hits = df.loc[(df['month'] == month) & (df['day'] == day), 'month_day']
        if len(hits):
            return hits.values[0]
        return month * 100 + day

    lb_st, lb_en = _month_day(label_st), _month_day(label_en)
    cand_st, cand_en = _month_day(candidate_st), _month_day(candidate_en)
    fea_position = _month_day(fea_en)

    month_day = df['month_day']
    ind_label = (month_day >= lb_st) & (month_day <= lb_en) & (df['action_type'] == 2)
    ind_candidate = (month_day >= cand_st) & (month_day <= cand_en)
    ind_fea = month_day <= fea_position

    # Rows used later to build features.
    data_fea = df.loc[ind_fea].copy()

    # Candidate set: distinct (user, item) pairs with a known item id.
    df_candidates = (
        df.loc[ind_candidate, ['user_log_acct', 'item_sku_id']]
        .drop_duplicates(subset=['user_log_acct', 'item_sku_id'])
        .dropna(subset=['item_sku_id'])
        .copy()
    )

    # Label rows.
    label = df.loc[ind_label, ['user_log_acct', 'item_sku_id', 'day']].copy()
    print('get label')

    # Attach labels. The two zero columns are created first only to keep the
    # original column order (label_cnt, label_days, user_item).
    df_candidates['label_cnt'] = 0
    df_candidates['label_days'] = 0
    df_candidates['user_item'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_sku_id'].astype(str)
    label['user_item'] = label['user_log_acct'].astype(str) + '_' + label['item_sku_id'].astype(str)
    dic_cnt = label['user_item'].value_counts().to_dict()
    dic_days = label.groupby('user_item')['day'].nunique().to_dict()
    # Vectorized lookup: pairs without any label row map to NaN, then to 0.
    df_candidates['label_cnt'] = df_candidates['user_item'].map(dic_cnt).fillna(0).astype('int64').values
    df_candidates['label_days'] = df_candidates['user_item'].map(dic_days).fillna(0).astype('int64').values

    return df_candidates, data_fea
           
%%time
# Validation split: candidates and features from April 6-10, labels from April 11-15.
df_valid_label,data_valid_fea = get_label(action_data, label_st = (4,11), label_en = (4,15), candidate_st = (4,6), candidate_en = (4,10), fea_en = (4,10))
           
get label
CPU times: user 5.44 s, sys: 948 ms, total: 6.39 s
Wall time: 6.39 s
           
%%time
# Training split: candidates from April 1-5, features up to April 5, labels from April 6-10.
df_train_label1,data_train_fea1 = get_label(action_data, label_st = (4,6), label_en = (4,10), candidate_st = (4,1), candidate_en = (4,5), fea_en = (4,5))
           
get label
CPU times: user 4.81 s, sys: 616 ms, total: 5.43 s
Wall time: 5.43 s
           
user_log_acct item_sku_id label_cnt label_days user_item
34296301 1144603 153700 0 0 1144603_153700
1415203 1129253 327893 0 0 1129253_327893
3960663 736788 201003 0 0 736788_201003
5158969 109461 256490 0 0 109461_256490
7377193 470525 142823 0 0 470525_142823

特征建構

原始特征

## Raw features
# One row per user and one row per item, keeping the first record found for each key.
user_cols = ['user_log_acct','age','sex','user_level','province','city','county']
item_cols = ['item_sku_id','brand_code','shop_id','item_third_cate_cd','vender_id','shop_score']
my_user = action_data[user_cols].drop_duplicates(subset=['user_log_acct'], keep='first')
my_item = action_data[item_cols].drop_duplicates(subset=['item_sku_id'], keep='first')
           

user特征

def gen_action_freq_feats(df, start_date):
    """Build per-user action-count features over several look-back windows.

    For each window ``w`` in (1, 3, 5, 7, 15, 30) days, rows with
    ``action_time >= start_date - w days`` are counted per user and action
    type. No upper time bound is applied here, so ``df`` is expected to be
    cut at the feature end date already.

    Args:
        df: Action log with the columns ``user_log_acct``, ``action_type``
            and ``action_time`` (datetime). ``action_type`` is assumed to be
            integer-coded, because the count columns are addressed as
            ``..._action_<int>``.
        start_date: ``datetime.datetime`` the windows are measured back from.

    Returns:
        DataFrame with one row per user in ``df``. For every window it holds
        ``user_log_acct_last{w}_days_action_{t}`` counts plus the ratios
        ``..._action_{1,3,4}_rt`` = count of type 2 / (1 + count of that
        type). Users with no rows in a window get NaN for that window.
    """
    key = ['user_log_acct']
    action = df[key + ['action_type', 'action_time']].copy()
    feats = pd.DataFrame(action[key].drop_duplicates())

    for w in tqdm([1, 3, 5, 7, 15, 30]):
        prefix = '_'.join(key) + '_last{}_days_action'.format(w)
        bef_start_date = start_date - datetime.timedelta(days=w)

        action_cl = action.loc[action['action_time'] >= bef_start_date, key + ['action_type']]
        dummies = pd.get_dummies(action_cl['action_type'], prefix=prefix)
        # get_dummies only emits columns for action types present in the
        # window; make sure the four types used below always exist so a
        # sparse window yields zero counts instead of a KeyError.
        for action_type in (1, 2, 3, 4):
            col = '{}_{}'.format(prefix, action_type)
            if col not in dummies.columns:
                dummies[col] = 0

        # Sum only the indicator columns: action_type / action_time are not
        # meaningful to add up, and summing a datetime column fails on
        # recent pandas versions.
        action_cl = pd.concat([action_cl[key], dummies], axis=1)
        action_cl = action_cl.groupby(key, as_index=False).sum()

        type2_cnt = action_cl[prefix + '_2']
        for other in (1, 3, 4):
            action_cl['{}_{}_rt'.format(prefix, other)] = type2_cnt / (1 + action_cl['{}_{}'.format(prefix, other)])

        feats = feats.merge(action_cl, on=key, how='left')
    return feats

# User action-frequency features; each start_date is the fea_en day used for that split.
u_fea_train1 = gen_action_freq_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
u_fea_val1   = gen_action_freq_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
           
100%|██████████| 6/6 [00:05<00:00,  1.15it/s]
100%|██████████| 6/6 [00:05<00:00,  1.06it/s]
           

合并特征集

# Feature names per source table, with the join keys left out.
u_fea_cols1 = [name for name in u_fea_train1.columns if name != 'user_log_acct']
u_fea_cols2 = [name for name in my_user.columns if name != 'user_log_acct']
i_fea_cols = [name for name in my_item.columns if name != 'item_sku_id']

# Join keys followed by every feature column.
train_cols = ['user_log_acct', 'item_sku_id'] + u_fea_cols1 + u_fea_cols2 + i_fea_cols
           

訓練集&驗證集

def _build_model_frame(labelled_pairs, user_action_feats):
    """Left-join user action features, user attributes and item attributes
    onto the labelled (user, item) pairs and add a binary ``label`` column
    (1 when ``label_cnt`` > 0, else 0)."""
    frame = labelled_pairs.merge(user_action_feats, on='user_log_acct', how='left')
    frame = frame.merge(my_user, on='user_log_acct', how='left')
    frame = frame.merge(my_item, on='item_sku_id', how='left')
    frame['label'] = (frame['label_cnt'] > 0).astype(int)
    return frame


# Training set
df_train = _build_model_frame(df_train_label1, u_fea_train1)

# Validation set
df_val = _build_model_frame(df_valid_label, u_fea_val1)
           

序列化

def set_tokenizer(docs, split_char=' ', max_len=100):
    """Turn a list of texts into fixed-length integer sequences.

    Args:
        docs: List of texts to tokenize.
        split_char: Character the texts are split on.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        X: The padded integer sequences (pad value 0).
        word_index: Mapping from each token to its integer index.
    """
    # The vocabulary is fitted on the same texts that are then encoded.
    tok = Tokenizer(lower=False, char_level=False, split=split_char)
    tok.fit_on_texts(docs)
    encoded = tok.texts_to_sequences(docs)
    padded = pad_sequences(encoded, maxlen=max_len, value=0)
    return padded, tok.word_index
           
# Per-user list of item ids (in row order) from each split's feature window,
# attached to every candidate row of that split.
valid_item_seq = data_valid_fea.groupby(['user_log_acct'])['item_sku_id'].agg(list).reset_index()
valid_item_seq.columns = ['user_log_acct', 'item_seq']
df_val = df_val.merge(valid_item_seq, on='user_log_acct', how='left')

train_item_seq = data_train_fea1.groupby(['user_log_acct'])['item_sku_id'].agg(list).reset_index()
train_item_seq.columns = ['user_log_acct', 'item_seq']
df_train = df_train.merge(train_item_seq, on='user_log_acct', how='left')

# Train rows first, then validation rows.
df_data = pd.concat([df_train[['item_seq']], df_val[['item_seq']]], axis=0, ignore_index=True)

# Join the ids with a bare ',' so every token is exactly the id text.
# str(list)[1:-1] produced "a, b, c": split on ',' that leaves " b" and " c"
# with a leading space, so the same item got a different vocabulary index
# depending on whether it was first in the sequence. Rows with no history
# (NaN after the left merge) become an empty text instead of the stray
# token 'a' from 'nan'[1:-1].
df_data['item_seq'] = df_data['item_seq'].apply(
    lambda seq: ','.join(str(item) for item in seq) if isinstance(seq, list) else ''
)
text_1_list = list(df_data['item_seq'])

print('開始序列化')
x1, index_1 = set_tokenizer(text_1_list, split_char=',', max_len=20)
print('序列化完成')
gc.collect()
           
開始序列化
序列化完成





0
           
# Id-like / categorical columns.
sparse_col = ['item_sku_id','age','sex','user_level','province','city','county','brand_code','shop_id','item_third_cate_cd','vender_id']

# Keys, labels and helper columns that are never used as dense features.
rest_col = ['user_log_acct','label_cnt','label_days','user_item','item_seq','label']

# Dense features: every float64/int64 column that is in neither list above.
_non_dense = set(sparse_col) | set(rest_col)
dense_cols = [
    name for name in df_train.columns
    if df_train[name].dtype in ['float64','int64'] and name not in _non_dense
]
           
## Standardize the dense features
from sklearn.preprocessing import StandardScaler

# Stack train rows on top of validation rows; missing values become 0.
df_data = pd.concat([df_train[dense_cols], df_val[dense_cols]], axis=0, ignore_index=True).fillna(0)

# NOTE(review): the scaler is fit on train and validation rows together -
# confirm that this is intended.
ss = StandardScaler()
ss.fit(df_data)
dense_feature = ss.transform(df_data)

# Number of dense feature columns.
dense_feature_input = dense_feature.shape[1]
           
# Split the stacked arrays back into train and validation parts:
# the first n_train rows are the training rows, the rest are validation rows.
n_train = df_train.shape[0]
train_input_1, test_input_1 = x1[:n_train], x1[n_train:]
train_input_2, test_input_2 = dense_feature[:n_train], dense_feature[n_train:]
train_label, test_label = df_train['label'], df_val['label']
           

LightGBM

import lightgbm as lgb

# Feature set: sparse_col + dense_cols
feature_cols = sparse_col + dense_cols

# Evaluate on the training rows (first entry) and the validation rows (second entry).
eval_set = [
    (df_train[feature_cols], df_train['label']),
    (df_val[feature_cols], df_val['label']),
]

lgb_params = dict(
    boosting_type="gbdt", num_leaves=2**7-1, reg_alpha=0, reg_lambda=0.01,
    max_depth=-1, n_estimators=2000, objective='binary', subsample=0.9,
    colsample_bytree=0.85, subsample_freq=1, min_child_samples=25,
    learning_rate=0.1, random_state=2021, metric="None", n_jobs=20,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)

# The training frame is sliced again here (not reused from eval_set) so the
# call receives the same kind of separate objects as before.
lgb_model.fit(
    df_train[feature_cols], df_train['label'],
    eval_set=eval_set, eval_metric='auc', verbose=100, early_stopping_rounds=100,
)
           
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.97877	valid_1's auc: 0.880513
Early stopping, best iteration is:
[16]	valid_0's auc: 0.91334	valid_1's auc: 0.884251





LGBMClassifier(colsample_bytree=0.85, metric='None', min_child_samples=25,
               n_estimators=2000, n_jobs=20, num_leaves=127, objective='binary',
               random_state=2021, reg_alpha=0, reg_lambda=0.01, subsample=0.9,
               subsample_freq=1)
           
# Feature set: dense_cols
feature_cols = dense_cols

# Evaluate on the training rows (first entry) and the validation rows (second entry).
eval_set = [
    (df_train[feature_cols], df_train['label']),
    (df_val[feature_cols], df_val['label']),
]

lgb_params = dict(
    boosting_type="gbdt", num_leaves=2**7-1, reg_alpha=0, reg_lambda=0.01,
    max_depth=-1, n_estimators=2000, objective='binary', subsample=0.9,
    colsample_bytree=0.85, subsample_freq=1, min_child_samples=25,
    learning_rate=0.1, random_state=2021, metric="None", n_jobs=20,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)

# The training frame is sliced again here (not reused from eval_set) so the
# call receives the same kind of separate objects as before.
lgb_model.fit(
    df_train[feature_cols], df_train['label'],
    eval_set=eval_set, eval_metric='auc', verbose=100, early_stopping_rounds=100,
)
           
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.963161	valid_1's auc: 0.878001
Early stopping, best iteration is:
[9]	valid_0's auc: 0.898261	valid_1's auc: 0.881422





LGBMClassifier(colsample_bytree=0.85, metric='None', min_child_samples=25,
               n_estimators=2000, n_jobs=20, num_leaves=127, objective='binary',
               random_state=2021, reg_alpha=0, reg_lambda=0.01, subsample=0.9,
               subsample_freq=1)
           
# Feature set: sparse_col
feature_cols = sparse_col

# Evaluate on the training rows (first entry) and the validation rows (second entry).
eval_set = [
    (df_train[feature_cols], df_train['label']),
    (df_val[feature_cols], df_val['label']),
]

lgb_params = dict(
    boosting_type="gbdt", num_leaves=2**7-1, reg_alpha=0, reg_lambda=0.01,
    max_depth=-1, n_estimators=2000, objective='binary', subsample=0.9,
    colsample_bytree=0.85, subsample_freq=1, min_child_samples=25,
    learning_rate=0.1, random_state=2021, metric="None", n_jobs=20,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)

# The training frame is sliced again here (not reused from eval_set) so the
# call receives the same kind of separate objects as before.
lgb_model.fit(
    df_train[feature_cols], df_train['label'],
    eval_set=eval_set, eval_metric='auc', verbose=100, early_stopping_rounds=100,
)
           
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.935879	valid_1's auc: 0.741442
Early stopping, best iteration is:
[24]	valid_0's auc: 0.847825	valid_1's auc: 0.744424





LGBMClassifier(colsample_bytree=0.85, metric='None', min_child_samples=25,
               n_estimators=2000, n_jobs=20, num_leaves=127, objective='binary',
               random_state=2021, reg_alpha=0, reg_lambda=0.01, subsample=0.9,
               subsample_freq=1)
           

繼續閱讀