步驟一:了解需求背景和挖掘目標;
步驟二:分析並建立流程圖;

步驟三:探索並處理資料;
步驟四:資料建模及應用,以及視覺化;
實作代碼:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans # 引入sklearn子產品裡的機器學習算法 k_means
class AirLightData():
    """Staged pipeline that clusters airline customers on L/R/F/M/C features.

    Each public method reads the file produced by the previous stage and
    writes its own result under ``data/``:

    ``detectData`` -> ``cleanData`` -> ``chooseData`` -> ``transformData``
    -> ``standarData`` -> ``classifyData``.

    NOTE(review): the intermediate files use the legacy ``.xls`` extension;
    confirm that the installed pandas version still ships a writer for it.
    """

    # Raw columns kept after cleaning; they feed the L/R/F/M/C features.
    CORE_COLUMNS = ('FFP_DATE', 'LOAD_TIME', 'FLIGHT_COUNT',
                    'SEG_KM_SUM', 'LAST_TO_END', 'avg_discount')
    # Feature names, also used as the axis labels of the radar chart.
    FEATURES = ('L', 'R', 'F', 'M', 'C')

    def detectData(self, filePath):
        """Print and save summary statistics of the raw CSV data.

        :param filePath: path of the raw CSV file
        :return: None; the transposed ``describe`` table is printed and
                 written to ``data/air_descrlib.xls``
        """
        df = pd.read_csv(filePath)
        summary = df.describe(include='all').T
        print(summary)
        summary.to_excel('data/air_descrlib.xls')

    @staticmethod
    def _filter_valid_rows(df):
        """Return only the rows of *df* that pass every validity check."""
        # Fares: drop rows where both yearly fares are missing ...
        has_fare = df['SUM_YR_1'].notnull() | df['SUM_YR_2'].notnull()
        # ... and rows where both yearly fares are zero.
        nonzero_fare = (df['SUM_YR_1'] != 0) | (df['SUM_YR_2'] != 0)
        # Mileage and discount rate must both be non-zero.
        nonzero_discount = df['avg_discount'] != 0
        nonzero_distance = df['SEG_KM_SUM'] != 0
        keep = has_fare & nonzero_fare & nonzero_discount & nonzero_distance
        return df[keep]

    def cleanData(self, filePath):
        """Drop invalid rows (missing or out-of-range values) from the raw CSV.

        :param filePath: path of the raw CSV file
        :return: None; the cleaned rows are written to
                 ``data/air_cleaned.xls``
        """
        df = pd.read_csv(filePath)
        self._filter_valid_rows(df).to_excel('data/air_cleaned.xls')

    def chooseData(self, filePath):
        """Keep only the columns needed to build the features.

        :param filePath: path of the cleaned Excel file
        :return: None; the selected columns are written to
                 ``data/air_coredata.xls``
        """
        df = pd.read_excel(filePath)
        df[list(self.CORE_COLUMNS)].to_excel('data/air_coredata.xls')

    @staticmethod
    def _build_lrfmc(df):
        """Derive the L, R, F, M and C columns from the core columns."""
        joined = pd.to_datetime(df['FFP_DATE'])
        loaded = pd.to_datetime(df['LOAD_TIME'])
        return pd.DataFrame({
            # Dividing by a 30-day Timedelta yields a plain float instead of
            # a Timedelta, so later numeric steps do not depend on how the
            # Excel writer happens to serialise durations.
            'L': (loaded - joined) / pd.Timedelta(days=30),
            'R': df['LAST_TO_END'] / 30,
            'F': df['FLIGHT_COUNT'],
            'M': df['SEG_KM_SUM'],
            'C': df['avg_discount'],
        })

    def transformData(self, filePath):
        """Convert the core columns into the L/R/F/M/C features.

        :param filePath: path of the core-column Excel file
        :return: None; the features are written to
                 ``data/air_coretransformdata.xls``
        """
        df = pd.read_excel(filePath)
        self._build_lrfmc(df).to_excel('data/air_coretransformdata.xls')

    @classmethod
    def _standardize(cls, df):
        """Return the feature columns of *df* scaled as (value - mean) / std."""
        # Select the features first so unrelated columns (for example the
        # index column that read_excel brings back) are never touched.
        features = df[list(cls.FEATURES)]
        return (features - np.mean(features, axis=0)) / np.std(features, axis=0)

    def standarData(self, filePath):
        """Standardise the features: (raw value - mean) / standard deviation.

        :param filePath: path of the L/R/F/M/C Excel file
        :return: None; the standardised features are written to
                 ``data/air_stdcoredata.xls``
        """
        df = pd.read_excel(filePath)
        self._standardize(df).to_excel('data/air_stdcoredata.xls')

    @staticmethod
    def _radar_angles(n_axes):
        """Return *n_axes* evenly spaced angles (radians) around the circle."""
        return np.linspace(0, 2 * np.pi, n_axes, endpoint=False)

    @staticmethod
    def _close_loop(values):
        """Append the first element to the end so a polar line closes."""
        values = np.asarray(values)
        return np.concatenate((values, values[:1]))

    def _plot_radar(self, centers, labels):
        """Draw one closed radar polygon per cluster centre."""
        # One axis per feature: the angle count follows the labels, not the
        # number of clusters.
        angles = self._radar_angles(len(labels))
        closed_angles = self._close_loop(angles)
        colors = 'brgyc'
        fig = plt.figure()
        ax = fig.add_subplot(111, polar=True)
        for idx, center in enumerate(centers):
            ax.plot(closed_angles, self._close_loop(center),
                    colors[idx % len(colors)] + 'o--', linewidth=1,
                    label='customer%d' % (idx + 1))
        # Tick positions must match the label count, so use the open angles.
        ax.set_thetagrids(angles * 180 / np.pi, list(labels))
        ax.set_rlim(-3, 3)
        plt.legend(loc='best')
        plt.show()

    def classifyData(self, filePath, k=5):
        """Cluster the standardised features with k-means and plot the centres.

        :param filePath: path of the standardised-feature Excel file
        :param k: number of clusters
        :return: None; cluster centres and labels are printed and the
                 centres are shown as a radar chart
        """
        df = pd.read_excel(filePath)
        feature_names = list(self.FEATURES)
        kmeans = KMeans(k)
        kmeans.fit(df[feature_names])
        print(kmeans.cluster_centers_)
        print(kmeans.labels_)
        # The labels are attached to the frame but not persisted.
        df['label'] = kmeans.labels_
        centers = np.array(kmeans.cluster_centers_)
        self._plot_radar(centers, feature_names)
if __name__ == "__main__":
    pipeline = AirLightData()
    # Earlier stages, kept for reference. Each one reads the file written by
    # the stage before it; re-enable them in order to rebuild the
    # intermediate files from the raw CSV.
    # pipeline.detectData('data/air_data.csv')
    # pipeline.cleanData('data/air_data.csv')
    # pipeline.chooseData('data/air_cleaned.xls')
    # pipeline.transformData('data/air_coredata.xls')
    # pipeline.standarData('data/air_coretransformdata.xls')
    # Final stage: cluster the standardised features and show the radar chart.
    pipeline.classifyData('data/air_stdcoredata.xls', k=5)
最後效果展示: