天天看點

機器學習實戰——特征工程(下)

上篇博文我們一起學習了特征工程

機器學習實戰——特征工程(上)

現在我們對招聘資料進行特征工程探索

具體代碼與源檔案可以從我的GitHub位址擷取

https://github.com/liuzuoping/MeachineLearning-Case

歡迎star~

7.招聘資料的特征工程探索

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
           

讀取源資料

lagou_df = pd.read_csv('./lagou_data5.csv', encoding='gbk')
lagou_df.head()
           
機器學習實戰——特征工程(下)
# advantage和label這兩個特征作用不大,可在最後剔除
# 分類變量one-hot處理
# pandas one-hot方法
pd.get_dummies(lagou_df['city']).head()
           
機器學習實戰——特征工程(下)
# sklearn onehot方法
# 先要寫死labelcoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
lbl = LabelEncoder()
lbl.fit(list(lagou_df['city'].values))
lagou_df['city'] = lbl.transform(list(lagou_df['city'].values))
# 檢視寫死結果
lagou_df['city'].head()
           
機器學習實戰——特征工程(下)
# 再由寫死轉為one-hot編碼
df_city = OneHotEncoder().fit_transform(lagou_df['city'].values.reshape((-1,1))).toarray()
df_city[:5]
           
機器學習實戰——特征工程(下)
# 分類特征統一one-hot處理
cat_features = ['city', 'industry', 'education', 'position_name', 'size', 'stage', 'work_year']
for col in cat_features:
    temp = pd.get_dummies(lagou_df[col])
    lagou_df = pd.concat([lagou_df, temp],axis=1)
    lagou_df = lagou_df.drop([col], axis=1)
    
lagou_df.shape
           
(1650, 54)
pd.options.display.max_columns = 999
lagou_df = lagou_df.drop(['advantage', 'label'], axis=1)
lagou_df.head()
           
機器學習實戰——特征工程(下)

職位描述特征的資訊提取

lagou_df2 = pd.read_csv('./lagou_data5.csv', encoding='gbk')
lagou_df2 = lagou_df2[['position_detail', 'salary']]
# 提取Python資訊
for i, j in enumerate(lagou_df2['position_detail']):
    if 'python' in j:
        lagou_df2['position_detail'][i] = j.replace('python', 'Python')
lagou_df2['Python'] = pd.Series()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'Python' in j:
        lagou_df2['Python'][i] = 1
    else:
        lagou_df2['Python'][i] = 0
        
lagou_df2['Python'][:20]
           
lagou_df2['R'] = pd.Series()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'R' in j:
        lagou_df2['R'][i] = 1
    else:
        lagou_df2['R'][i] = 0
        
lagou_df2['R'].value_counts()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'sql' in j:
        lagou_df2['position_detail'][i] = j.replace('sql', 'SQL')

lagou_df2['SQL'] = pd.Series()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'SQL' in j:
        lagou_df2['SQL'][i] = 1
    else:
        lagou_df2['SQL'][i] = 0
        
lagou_df2['SQL'].value_counts()
lagou_df2['Excel'] = pd.Series()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'Excel' in j:
        lagou_df2['Excel'][i] = 1
    else:
        lagou_df2['Excel'][i] = 0
        
lagou_df2['Excel'].value_counts()
lagou_df2['Java'] = pd.Series()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'Java' in j:
        lagou_df2['Java'][i] = 1
    else:
        lagou_df2['Java'][i] = 0
        
lagou_df2['Java'].value_counts()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'linux' in j:
        lagou_df2['position_detail'][i] = j.replace('linux', 'Linux')
        
lagou_df2['Linux'] = pd.Series()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'Linux' in j:
        lagou_df2['Linux'][i] = 1
    else:
        lagou_df2['Linux'][i] = 0
        
lagou_df2['Linux'].value_counts()
lagou_df2['C++'] = pd.Series()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'C++' in j:
        lagou_df2['C++'][i] = 1
    else:
        lagou_df2['C++'][i] = 0
        
lagou_df2['C++'].value_counts()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'spark' in j:
        lagou_df2['position_detail'][i] = j.replace('spark', 'Spark')

lagou_df2['Spark'] = pd.Series()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'Spark' in j:
        lagou_df2['Spark'][i] = 1
    else:
        lagou_df2['Spark'][i] = 0
        
lagou_df2['Spark'].value_counts()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'tensorflow' in j:
        lagou_df2['position_detail'][i] = j.replace('tensorflow', 'Tensorflow')
        
    if 'TensorFlow' in j:
        lagou_df2['position_detail'][i] = j.replace('TensorFlow', 'Tensorflow')
        
lagou_df2['Tensorflow'] = pd.Series()
for i, j in enumerate(lagou_df2['position_detail']):
    if 'Tensorflow' in j:
        lagou_df2['Tensorflow'][i] = 1
    else:
        lagou_df2['Tensorflow'][i] = 0
        
lagou_df2['Tensorflow'].value_counts()
lagou_df2 = lagou_df2.drop(['position_detail'], axis=1)
lagou_df2.head()
           
機器學習實戰——特征工程(下)

繼續閱讀