賽題介紹
賽題以醫療資料挖掘為背景,要求選手使用提供的心跳信号傳感器資料訓練模型并完成不同心跳信号的分類的任務。
賽事位址,目前長期賽已經開放,大家都可以前往報名參與學習:
https://tianchi.aliyun.com/competition/entrance/531883/introduction
賽事資料集解釋
該資料來自某平台心電圖資料記錄,總資料量超過20萬,主要為1列心跳信号序列資料,其中每個樣本的信号序列采樣頻次一緻,長度相等。資料集均為205維的心跳時序資料。
個人賽題了解
本次賽題實質是時序資料的分類問題。
資料集均為205維的心跳時序資料,且均進行脫敏和歸一化處理,進行EDA後,不存在空值。
需要實作自定義評價函數,後處理函數。
醫學上将形态學特征作為心電圖信号診斷的主要分析依據之一,是以對于發現心電信号的形态學特性尤為重要。這裡選擇對形态學特征敏感的深度學習神經網絡架構,并融合多個模型。
解決方案
自定義評價函數與後處理函數
預測結果與實際心跳類型結果進行對比,求預測的機率與真實值之差的絕對值(越小越好)。根據賽題評測标準,計算得分是心跳信号預測的機率與實際心跳類型結果之差的絕對值之和,因此可以透過對機率做後處理來降低計算得分。
經測試,後處理函數門檻值取值為0.5時,效果最佳。
# Custom evaluation metric
def abs_sum(y_pre, y_tru):
    """Return the total absolute difference between predictions and targets.

    Args:
        y_pre: Array-like of predicted values (e.g. per-class probabilities).
        y_tru: Array-like of ground-truth values, broadcastable against
            ``y_pre``.

    Returns:
        A NumPy scalar: the sum of ``|y_pre - y_tru|`` over every element.
        Lower is better.
    """
    y_pre = np.asarray(y_pre)
    y_tru = np.asarray(y_tru)
    # One vectorised reduction handles input of any dimensionality; the
    # previous nested builtin sum() raised on 1-D input and iterated in Python.
    return np.abs(y_pre - y_tru).sum()
# Custom post-processing of predicted probabilities
def postprocessing(test, threshold=0.5):
    """Snap predicted class probabilities to hard 0/1 values where possible.

    Each row is handled independently:

    * If any entry is strictly greater than ``threshold``, every entry above
      the threshold becomes 1 and every other entry becomes 0.
    * Otherwise, if any entry is strictly below ``threshold``, those entries
      become 0; if exactly one non-zero entry then remains, it becomes 1.
    * Otherwise the row is left unchanged.

    Args:
        test: 2-D array-like or DataFrame of per-class scores, one row per
            sample. Any number of columns is accepted.
        threshold: Decision boundary; defaults to 0.5.

    Returns:
        A new float ``pandas.DataFrame`` with the same index and columns as
        ``pd.DataFrame(test)``. The input object is never modified.
    """
    frame = pd.DataFrame(test)
    # Work on an explicit copy so the caller's array/DataFrame is untouched.
    probs = frame.to_numpy(dtype=float, copy=True)

    above = probs > threshold
    below = probs < threshold
    confident = above.any(axis=1)               # row maximum exceeds threshold
    ambiguous = ~confident & below.any(axis=1)  # no winner, but some low entries

    result = probs.copy()
    # Confident rows: binarise against the threshold (bool -> 0.0 / 1.0).
    result[confident] = above[confident]

    # Ambiguous rows: clear the low entries, then promote a lone survivor.
    zeroed = np.where(below, 0.0, probs)
    survivors = zeroed != 0
    lone = ambiguous & (survivors.sum(axis=1) == 1)
    result[ambiguous] = zeroed[ambiguous]
    result[lone] = survivors[lone]

    return pd.DataFrame(result, index=frame.index, columns=frame.columns)
模型建構
為增強模型魯棒性,選擇5折交叉驗證;
合理使用Dropout、BatchNormalization、callbacks 防止過拟合,增強預測效果
# Cross-validation setup: 5 folds
folds = 5
seed = 2021  # fixed random seed so the fold assignment is reproducible
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
# Dense one-hot encoder for the labels.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
CNN模型
# Build the CNN classifier
def CNN_model():
    """Create and compile the 1-D CNN used for heartbeat classification.

    The network takes a ``(205, 1)`` input, applies three same-padded
    convolutions (32, 64, 128 filters, kernel size 5), max-pooling, dropout
    and two dense layers, and ends in a 4-way softmax.

    Returns:
        A compiled Keras model (Adam, learning rate 0.001, sparse categorical
        cross-entropy, accuracy reported as ``acc``).
    """
    nclass = 4
    inp = Input(shape=(205, 1))

    # Convolutional feature extractor: widen the channels 32 -> 64 -> 128.
    features = inp
    for n_filters in (32, 64, 128):
        features = Convolution1D(n_filters, kernel_size=5, activation=activations.relu, padding="same")(features)
    features = MaxPool1D(pool_size=2)(features)
    features = Dropout(rate=0.6)(features)
    features = Flatten(name='flatten')(features)

    # Dense head; the layer names are kept because weights are loaded by name.
    hidden = Dense(512, activation=activations.relu, name="dense_1")(features)
    hidden = Dense(1024, activation=activations.relu, name="dense_2")(hidden)
    probabilities = Dense(nclass, activation=activations.softmax, name="dense_3_mitbih")(hidden)

    model = models.Model(inputs=inp, outputs=probabilities)
    model.compile(
        optimizer=optimizers.Adam(0.001),
        loss=losses.sparse_categorical_crossentropy,
        metrics=['acc'],
    )
    return model
# Train the CNN with K-fold cross-validation and average the test predictions.
# NOTE(review): the previous version computed `test_pred_CNN` but never
# accumulated it, and derived `test_CNN` from a `test` variable that is not
# defined in this section; the fold predictions are now averaged explicitly.
fold_preds_CNN = []
for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
    print('************************************ {} ************************************'.format(str(i+1)))
    trn_x, trn_y = x_train.iloc[train_index], y_train[train_index]
    val_x, val_y = x_train.iloc[valid_index], y_train[valid_index]
    # Add a trailing channel axis so each sample has shape (timesteps, 1).
    trn_x = np.array(trn_x)[..., np.newaxis]
    val_x = np.array(val_x)[..., np.newaxis]

    model_CNN = CNN_model()
    # The best model of each fold (by validation accuracy) is saved here.
    file_path = "baseline_cnn_datawhale_transfer_fullupdate_" + str(i+1) + ".h5"
    checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    early = EarlyStopping(monitor="val_acc", mode="max", patience=5, verbose=1)
    redonplat = ReduceLROnPlateau(monitor="val_acc", mode="max", patience=3, verbose=2)
    callbacks_list = [checkpoint, early, redonplat]

    # Warm start: initialise from the weights of a previously trained single-fold model.
    model_CNN.load_weights("baseline_cnn_datawhale_1fold.h5", by_name=True)
    history = model_CNN.fit(trn_x, trn_y, epochs=1000, verbose=2, callbacks=callbacks_list, validation_data=(val_x, val_y))

    # NOTE(review): this predicts with the weights at the end of training, not
    # the checkpointed best ones. Assumes x_test already carries the trailing
    # channel axis — TODO confirm.
    test_pred_CNN = model_CNN.predict(x_test)
    fold_preds_CNN.append(test_pred_CNN)

# Mean of the per-fold predictions (sum over folds divided by the fold count).
test_CNN = np.mean(fold_preds_CNN, axis=0)
LSTM模型
# Build the LSTM classifier
def CuDNNLSTM_model():
    """Create and compile the stacked CuDNNLSTM heartbeat classifier.

    Three sequence-returning CuDNNLSTM layers (32, 64, 128 units) on a
    ``(205, 1)`` input are followed by max-pooling, dropout, two dense layers
    and a 4-way softmax.

    Returns:
        A compiled Keras ``Sequential`` model (Adam, learning rate 0.001,
        sparse categorical cross-entropy, accuracy reported as ``acc``).
    """
    nclass = 4
    # Layers are listed in the order they are stacked.
    stack = [
        CuDNNLSTM(32, return_sequences=True, input_shape=(205, 1)),
        CuDNNLSTM(64, return_sequences=True),
        CuDNNLSTM(128, return_sequences=True),
        MaxPool1D(pool_size=2),
        Dropout(0.6),
        Flatten(),
        Dense(512, activation='relu'),
        Dense(1024, activation='relu'),
        Dense(nclass, activation=activations.softmax, name="dense_2_tianchi"),
    ]
    model = models.Sequential(stack)
    model.compile(
        optimizer=optimizers.Adam(0.001),
        loss=losses.sparse_categorical_crossentropy,
        metrics=['acc'],
    )
    return model
# Train the LSTM with K-fold cross-validation and average the test predictions.
# NOTE(review): the previous version computed `test_pred_CuDNNLSTM` but never
# accumulated it, and derived `test_CuDNNLSTM` from a `test` variable that is
# not defined in this section; the fold predictions are now averaged explicitly.
fold_preds_CuDNNLSTM = []
for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
    print('************************************ {} ************************************'.format(str(i+1)))
    trn_x, trn_y = x_train.iloc[train_index], y_train[train_index]
    val_x, val_y = x_train.iloc[valid_index], y_train[valid_index]
    # Add a trailing channel axis so each sample has shape (timesteps, 1).
    trn_x = np.array(trn_x)[..., np.newaxis]
    val_x = np.array(val_x)[..., np.newaxis]

    model_CuDNNLSTM = CuDNNLSTM_model()
    # The best model of each fold (by validation accuracy) is saved here.
    file_path = "baseline_CuDNNLSTM_datawhale_transfer_fullupdate_" + str(i+1) + ".h5"
    checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    early = EarlyStopping(monitor="val_acc", mode="max", patience=5, verbose=1)
    redonplat = ReduceLROnPlateau(monitor="val_acc", mode="max", patience=3, verbose=2)
    callbacks_list = [checkpoint, early, redonplat]

    # Warm start: initialise from the weights of a previously trained single-fold model.
    model_CuDNNLSTM.load_weights("baseline_CuDNNLSTM_datawhale_1fold.h5", by_name=True)
    history = model_CuDNNLSTM.fit(trn_x, trn_y, epochs=1000, verbose=2, callbacks=callbacks_list, validation_data=(val_x, val_y))

    # NOTE(review): this predicts with the weights at the end of training, not
    # the checkpointed best ones. Assumes x_test already carries the trailing
    # channel axis — TODO confirm.
    test_pred_CuDNNLSTM = model_CuDNNLSTM.predict(x_test)
    fold_preds_CuDNNLSTM.append(test_pred_CuDNNLSTM)

# Mean of the per-fold predictions (sum over folds divided by the fold count).
test_CuDNNLSTM = np.mean(fold_preds_CuDNNLSTM, axis=0)
ResNet50 模型
# ResNet50 building block -- identity block: deepens the network
def identity_block(input_tensor, kernel_size, filters, stage, block):
    """Apply a 1-D bottleneck residual block with an identity shortcut.

    Args:
        input_tensor: Tensor entering the block; it is added back unchanged
            to the output of the convolution branch.
        kernel_size: Kernel size of the middle convolution.
        filters: Three filter counts, for the 1x1 reduce, the middle and the
            1x1 expand convolutions respectively.
        stage: Stage label, used only to build layer names.
        block: Block label (e.g. ``'b'``), used only to build layer names.

    Returns:
        The output tensor after the residual addition and final ReLU.
    """
    squeeze, middle, expand = filters
    suffix = str(stage) + block + '_branch'
    conv_prefix, bn_prefix = 'res' + suffix, 'bn' + suffix

    # 1x1 convolution: reduce the channel count.
    out = Convolution1D(squeeze, 1, name=conv_prefix + '2a')(input_tensor)
    out = BatchNormalization(name=bn_prefix + '2a')(out)
    out = Activation('relu')(out)

    # Middle convolution with the requested kernel size, same padding.
    out = Convolution1D(middle, kernel_size, padding='same', name=conv_prefix + '2b')(out)
    out = BatchNormalization(name=bn_prefix + '2b')(out)
    out = Activation('relu')(out)

    # 1x1 convolution: expand the channel count again.
    out = Convolution1D(expand, 1, name=conv_prefix + '2c')(out)
    out = BatchNormalization(name=bn_prefix + '2c')(out)

    # Residual connection, then the block's final activation.
    out = layers.add([out, input_tensor])
    return Activation('relu')(out)
# ResNet50 building block -- conv block: changes the tensor dimensions
def conv_block(input_tensor, kernel_size, filters, stage, block, strides=2):
    """Apply a 1-D bottleneck residual block with a projection shortcut.

    Args:
        input_tensor: Tensor entering the block.
        kernel_size: Kernel size of the middle convolution.
        filters: Three filter counts, for the 1x1 reduce, the middle and the
            1x1 expand convolutions respectively.
        stage: Stage label, used only to build layer names.
        block: Block label (e.g. ``'a'``), used only to build layer names.
        strides: Stride of the first main-branch convolution and of the
            shortcut convolution.

    Returns:
        The output tensor after the residual addition and final ReLU.
    """
    squeeze, middle, expand = filters
    suffix = str(stage) + block + '_branch'
    conv_prefix, bn_prefix = 'res' + suffix, 'bn' + suffix

    # Main branch -- 1x1 convolution: reduce channels (strided).
    out = Convolution1D(squeeze, 1, strides=strides, name=conv_prefix + '2a')(input_tensor)
    out = BatchNormalization(name=bn_prefix + '2a')(out)
    out = Activation('relu')(out)

    # Main branch -- middle convolution with the requested kernel size.
    out = Convolution1D(middle, kernel_size, padding='same', name=conv_prefix + '2b')(out)
    out = BatchNormalization(name=bn_prefix + '2b')(out)
    out = Activation('relu')(out)

    # Main branch -- 1x1 convolution: expand channels.
    out = Convolution1D(expand, 1, name=conv_prefix + '2c')(out)
    out = BatchNormalization(name=bn_prefix + '2c')(out)

    # Shortcut branch: strided 1x1 projection so both branches line up.
    residual = Convolution1D(expand, 1, strides=strides, name=conv_prefix + '1')(input_tensor)
    residual = BatchNormalization(name=bn_prefix + '1')(residual)

    out = layers.add([out, residual])
    return Activation('relu')(out)
# Build the ResNet50 classifier
def ResNet50_model(input_shape=(205, 1), classes=4):
    """Create and compile a 1-D ResNet50-style classifier.

    Args:
        input_shape: Shape of one sample, ``(timesteps, channels)``. The
            default is now an immutable tuple; lists are still accepted.
        classes: Number of output classes of the final softmax layer.

    Returns:
        A compiled Keras model (Adam, learning rate 0.001, sparse categorical
        cross-entropy, accuracy reported as ``acc``).

    The lengths in the comments below are for the default 205-step input.
    """
    img_input = Input(shape=input_shape)

    # Stem: pad 205 -> 211, strided convolution -> 103, max-pool -> 51.
    x = ZeroPadding1D(3)(img_input)
    x = Convolution1D(64, 7, strides=2, name='conv1')(x)  # [103, 64]
    x = BatchNormalization(name='bn_conv1')(x)
    x = Activation('relu')(x)
    x = MaxPool1D(3, strides=2)(x)  # [51, 64]

    # (stage, filters, block labels, stride of the leading conv block)
    stage_specs = (
        (2, [64, 64, 256], 'abc', 1),        # [51, 256]
        (3, [128, 128, 512], 'abcd', 2),     # [26, 512]
        (4, [256, 256, 1024], 'abcdef', 2),  # [13, 1024]
        (5, [512, 512, 2048], 'abc', 2),     # [7, 2048]
    )
    for stage, filters, blocks, strides in stage_specs:
        # Each stage starts with a projection block, then identity blocks.
        x = conv_block(x, 3, filters, stage=stage, block=blocks[0], strides=strides)
        for label in blocks[1:]:
            x = identity_block(x, 3, filters, stage=stage, block=label)

    # Pooling in place of a fully connected layer.
    # NOTE(review): with 7 steps coming in, a pool of 6 (valid padding, stride
    # equal to the pool size) averages only the first 6 steps. Kept as-is so
    # the model stays compatible with previously trained weights.
    x = AveragePooling1D(6, name='avg_pool')(x)

    # Classification head.
    x = Flatten()(x)
    x = Dense(classes, activation='softmax', name='fc1000')(x)

    model = models.Model(img_input, x, name='resnet50')
    opt = optimizers.Adam(0.001)
    model.compile(optimizer=opt, loss=losses.sparse_categorical_crossentropy, metrics=['acc'])
    return model
# Train ResNet50 with K-fold cross-validation and average the test predictions.
# NOTE(review): the previous version computed `test_pred_ResNet50` but never
# accumulated it, and derived `test_ResNet50` from a `test` variable that is
# not defined in this section; the fold predictions are now averaged explicitly.
fold_preds_ResNet50 = []
for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
    # To resume an interrupted run, skip finished folds here,
    # e.g. `if i in range(2): continue`.
    print('************************************ {} ************************************'.format(str(i+1)))
    trn_x, trn_y = x_train.iloc[train_index], y_train[train_index]
    val_x, val_y = x_train.iloc[valid_index], y_train[valid_index]
    # Add a trailing channel axis so each sample has shape (timesteps, 1).
    trn_x = np.array(trn_x)[..., np.newaxis]
    val_x = np.array(val_x)[..., np.newaxis]

    model_ResNet50 = ResNet50_model()
    # The best model of each fold (by validation accuracy) is saved here.
    file_path = "baseline_ResNet50_datawhale_transfer_fullupdate_" + str(i+1) + ".h5"
    checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    early = EarlyStopping(monitor="val_acc", mode="max", patience=5, verbose=1)
    redonplat = ReduceLROnPlateau(monitor="val_acc", mode="max", patience=3, verbose=2)
    callbacks_list = [checkpoint, early, redonplat]

    # Warm start: initialise from the weights of a previously trained single-fold model.
    model_ResNet50.load_weights("baseline_ResNet50_datawhale_1fold.h5", by_name=True)
    history = model_ResNet50.fit(trn_x, trn_y, epochs=1000, verbose=2, callbacks=callbacks_list, validation_data=(val_x, val_y))

    # NOTE(review): this predicts with the weights at the end of training, not
    # the checkpointed best ones. Assumes x_test already carries the trailing
    # channel axis — TODO confirm.
    test_pred_ResNet50 = model_ResNet50.predict(x_test)
    fold_preds_ResNet50.append(test_pred_ResNet50)

# Mean of the per-fold predictions (sum over folds divided by the fold count).
test_ResNet50 = np.mean(fold_preds_ResNet50, axis=0)
模型融合
這裡使用了簡單的均值權重融合
# Model fusion: equal-weight blend of the three models' test predictions.
w = [1/3, 1/3, 1/3]
# Blend independent copies so the per-model predictions stay untouched.
temp_CNN, temp_CuDNNLSTM, temp_ResNet50 = (
    copy.deepcopy(pred) for pred in (test_CNN, test_CuDNNLSTM, test_ResNet50)
)
test_pre = Weighted_method(temp_CNN, temp_CuDNNLSTM, temp_ResNet50, w)
# Apply the post-processing step to the blended predictions.
temp = postprocessing(test_pre)
不足
- EDA分析不夠
- 未對資料樣本類别不均衡問題進行處理
- 單模型參數優化工作不夠
- 模型融合太過簡單
- ...
參賽感受和建議
第一次參加,還存在很多的不足之處,對于某些深度學習架構運用還不夠熟練,有待提高。但是比賽還是收獲頗豐。
比賽初期:學習Datawhale與天池聯合推出的相關教學方案,跟着baseline的思路走,試了一下常見的樹模型,XGBoost、LightGBM、Catboost,效果不是很理想,score在400到500之間,排行榜都進不了,當時100名為380。實話說,當時心态很差,繼續調整。
比賽中期:嘗試使用普通的神經網絡方法,增加預測資料後處理函數,score能達到400到450之間,但依然上不了榜,甚至模型還存在過拟合現象。最後轉移思路,尋找合适的深度學習方法,經過多種嘗試選擇了3種深度學習神經網絡模型:CNN,LSTM,ResNet50。score達到300左右。順利進入榜單中下區域。
比賽後期:針對三種單模型進行參數設定手動調優,對預測結果後處理函數進行手動門檻值調優,最後選擇均值權重融合,最後在A榜達到200分左右的成績,于B榜得到226分的成績,慶幸模型魯棒性較好。
限于時間等相關原因,模型還存在很多不足。
通過此次比賽,收獲頗豐,希望參加比賽的小夥伴們,不要因為模型結果不理想而輕易放棄,結合資料本身特征進行思考,也許換一個思路,豁然開朗,船到橋頭自然直。最後感謝主辦方給我們大家提供這麼好,這麼優秀的學習交流平台。

關注阿裡雲天池,人人都可以玩轉大資料