預測
如下代碼使用邏輯回歸(Logistic Regression)進行生存預測。
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
# Load the preprocessed Titanic table. Rows whose 'Survived' value is -1 are
# the ones to predict; every other row is used for training.
data = pd.read_csv(r"E:\MLdata\kaggle_titanic\processing.csv")

# Training rows, without the identifier column.
data_train = data[data['Survived'] != -1].drop(["PassengerId"], axis=1)
# Split features and label by column name instead of by position. The
# prediction features below are also selected by name, so both feature sets
# keep the same columns even if 'Survived' is not the last column of the CSV,
# and the label can never end up among the training features.
data_train_x = data_train.drop(['Survived'], axis=1)
data_train_y = data_train['Survived']

Lr = LogisticRegression()
Lr.fit(data_train_x, data_train_y)

# Rows to predict: drop the placeholder label, keep 'PassengerId' for the
# output file, and remove it only from the feature matrix.
data_predict = data[data['Survived'] == -1].drop(['Survived'], axis=1)
data_predict_x = data_predict.drop(['PassengerId'], axis=1)
label = Lr.predict(data_predict_x)

# Optional check: compare the predictions with the 'Survived' column of
# gender_submission.csv and print the fraction of matching rows.
# standard_label=pd.read_csv(r"E:\MLdata\kaggle_titanic\gender_submission.csv")["Survived"]
# print(np.sum(standard_label==label)/standard_label.shape[0])

# Append the predicted label as the last column, then write the two
# result columns to the output CSV without the index.
data_predict.insert(data_predict.columns.size, "Survived", label)
df = pd.DataFrame(
    data=data_predict.loc[:, ['PassengerId', "Survived"]].values,
    columns=["PassengerId", "Survived"],
)
df.to_csv(r"E:\MLdata\kaggle_titanic\res\t2.csv", index=False)
構造特徵
特徵構造主要考慮如下方面:
- 數值特徵的非線性因素,如對一些數值特徵進行指數放大、縮小
- 特徵與特徵之間的關聯關係
由於兒童更容易得到照顧,因此構造特徵:
# Binary flag: 1 when the passenger's age is at most 10, otherwise 0.
data_train['isChild'] = data_train['Age'].le(10).astype(int)
# Output of data_train['isChild'][:3] as recorded by the author:
# 0 0
# 1 0
# 2 0
# Name: isChild, dtype: int32
年齡越大越不易生存,因此對年齡取平方以放大其影響:
# Replace 'Age' with its square (element-wise product of the column with
# itself). The column is overwritten in place, so executing this line again
# squares the already squared values.
data_train['Age'] = data_train['Age'].mul(data_train['Age'])
考慮到艙位等級較高(比如一等艙)且較年輕的乘客更容易生存,故構造特徵 $\text{Age} \times \text{Pclass}$:
import sklearn.preprocessing as preprocessing

# Interaction feature: product of the current 'Age' column and 'Pclass'.
# It is computed before 'Age' is standardized below.
data_train['Age_Pclass'] = data_train['Age'] * data_train['Pclass']

# Standardize each column with its own StandardScaler. Re-fitting a single
# scaler for the second column would discard the mean/scale learned for
# 'Age', so those statistics could no longer be applied to other data
# (e.g. the rows to predict) with transform().
age_scaler = preprocessing.StandardScaler()
# fit_transform returns an (n_samples, 1) array; ravel() flattens it to one
# value per row before it is stored as a column.
data_train['Age'] = age_scaler.fit_transform(
    data_train['Age'].values.reshape(-1, 1)
).ravel()

age_pclass_scaler = preprocessing.StandardScaler()
data_train['Age_Pclass'] = age_pclass_scaler.fit_transform(
    data_train['Age_Pclass'].values.reshape(-1, 1)
).ravel()

# Backward-compatible alias: 'scaler' previously ended up fitted on
# 'Age_Pclass', so keep that name pointing at the same fitted object.
scaler = age_pclass_scaler