本部落格旨在幫助學生自己鞏固所學,若能幫得上他人也是榮幸之至
首先以下是借鑒過的幾個github庫,非常感謝:
https://github.com/apachecn/python_data_analysis_and_mining_action
https://github.com/keefecn/python_practice_of_data_analysis_and_mining
https://github.com/Stormzudi/Python-Data-Mining
https://github.com/Echo9573/DataAnalysisbyPython
主要參考https://blog.csdn.net/u012063773/article/details/79302599
1 擷取目前工作目錄及其所有子檔案夾中全部圖檔的絕對路徑
from __future__ import division
from PIL import Image
import cv2
import numpy as np
import os
from pandas import DataFrame
import pandas as pd
# 擷取目前工作目錄及子目錄下所有圖檔檔案的絕對路徑,包含其所有子檔案夾中的圖檔
def getimgdir(imgfilename):
    """Collect the paths of all image files under a directory tree.

    The tree is searched recursively, so images in every nested
    sub-directory are included.

    Args:
        imgfilename: Directory to search. When empty or None, the current
            working directory is searched instead.

    Returns:
        A list of paths to files whose extension is ``.png``, ``.jpg`` or
        ``.bmp`` (compared case-insensitively). Each path starts with the
        directory that was searched; it is absolute when that directory
        is absolute (always the case for the working-directory fallback).
    """
    img_types = (".png", ".jpg", ".bmp")
    base_dir = imgfilename if imgfilename else os.getcwd()

    imgdirs = []
    # Walk the requested directory itself (not "."), so the returned paths
    # point at files that really exist below base_dir.
    for root, _dirs, files in os.walk(base_dir):
        for afile in files:
            # splitext yields '' for names without an extension, so such
            # files are skipped instead of raising ValueError.
            if os.path.splitext(afile)[1].lower() in img_types:
                # os.path.join keeps the separators portable.
                imgdirs.append(os.path.join(root, afile))
    return imgdirs
2 擷取指定檔案夾下所有圖檔的絕對路徑(不包含下層檔案夾中的圖檔)
def getimgdir_designed(imgfilename):
    """Collect the paths of the image files directly inside a directory.

    Only the top level of the directory is inspected; images in nested
    sub-directories are not returned.

    Args:
        imgfilename: Directory to search.

    Returns:
        A list of paths (``imgfilename`` joined with each file name) for
        files whose extension is ``.png``, ``.jpg`` or ``.bmp`` (compared
        case-insensitively), or None when ``imgfilename`` does not exist
        (a message is printed in that case).
    """
    if not os.path.exists(imgfilename):
        # Tell the user that the requested folder does not exist.
        print('你設定的指定檔案夾不存在!')
        return None

    img_types = (".png", ".jpg", ".bmp")
    # The first item produced by os.walk describes the top-level directory;
    # taking only that item keeps the search non-recursive. The default
    # covers the case where the walk yields nothing (e.g. a plain file).
    _root, _dirs, files = next(os.walk(imgfilename), (imgfilename, [], []))

    # splitext yields '' for names without an extension, so such files are
    # skipped instead of raising ValueError; os.path.join keeps the
    # separators portable.
    return [
        os.path.join(imgfilename, afile)
        for afile in files
        if os.path.splitext(afile)[1].lower() in img_types
    ]
3 将圖檔切割成(2*halfw)*(2*halfh)像素的檔案,并傳回切割後的檔案的絕對路徑
def splitimage(src, halfw, halfh, dstpath):
    """Crop the central (2*halfw) x (2*halfh) pixel region out of an image.

    The cropped image is saved next to a name derived from the source
    file (``<name>_cut<ext>``) and the path of that new file is returned.

    Note: before cropping, make sure the centre of the picture actually
    contains the content of interest; if it does not, the picture needs
    to be pre-processed first.

    Args:
        src: Path of the image file to crop.
        halfw: Half of the width of the cropped image, in pixels.
        halfh: Half of the height of the cropped image, in pixels.
        dstpath: Directory in which the cropped image is saved. An empty
            string means "the directory that contains ``src``".

    Returns:
        The path of the saved cropped image.
    """
    src_dir, src_name = os.path.split(src)
    if dstpath == '':
        dstpath = src_dir

    # splitext keeps every dot that belongs to the base name
    # ("a.b.png" -> "a.b_cut.png") instead of truncating at the first dot.
    basename, ext = os.path.splitext(src_name)
    pic_cut_name = os.path.join(dstpath, basename + '_cut' + ext)

    # The context manager closes the underlying file once the crop is saved.
    with Image.open(src) as img:
        w, h = img.size
        # Pillow's crop box is (left, upper, right, lower): x coordinates
        # come from the width, y coordinates from the height.
        box = (w // 2 - halfw, h // 2 - halfh, w // 2 + halfw, h // 2 + halfh)
        img.crop(box).save(pic_cut_name)
    return pic_cut_name
4 顔色矩方式進行特征提取
# 顔色矩方式進行特征提取
def color_moments(filename):
    """Extract colour-moment features from an image file.

    The image is read with OpenCV, converted from BGR to HSV, and three
    statistics are computed for each of the H, S and V channels.

    Args:
        filename: Path of the image file to read.

    Returns:
        A list of nine values ordered as
        ``[h_mean, s_mean, v_mean, h_std, s_std, v_std,
        h_third, s_third, v_third]``, where ``*_third`` is the signed cube
        root of the third central moment. Returns None when the image
        cannot be read.
    """
    bgr = cv2.imread(filename)
    if bgr is None:
        return

    # Convert BGR to HSV and separate the three channels.
    h, s, v = cv2.split(cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV))
    channels = (h, s, v)

    # First moment: the mean of each channel.
    means = [np.mean(channel) for channel in channels]

    # Second moment: the standard deviation of each channel.
    deviations = [np.std(channel) for channel in channels]

    # Third moment: cube root of the mean cubed deviation. The root is
    # taken on the absolute value and the sign restored afterwards, since
    # a negative base cannot be raised to the power 1/3 directly.
    third_moments = []
    for channel in channels:
        cubed_mean = np.mean((channel - channel.mean()) ** 3)
        sign = -1 if cubed_mean < 0 else 1
        third_moments.append(abs(cubed_mean) ** (1. / 3) * sign)

    return means + deviations + third_moments
5 縱向連接多個表格(例如使用 pd.concat([df1, df2], axis=0))
6 随機打亂順序
# Two alternative ways of putting the samples into random order.
#
# Option A: shuffle a mutable sequence in place with the standard library.
# NOTE(review): `data` is not assigned before this call in the visible
# snippet; it has to exist already (e.g. as a list) for the call to run.
from random import shuffle # import the random shuffling function
shuffle(data)
# Option B (alternative):
inputfile = 'moment.csv'
data = pd.read_csv(inputfile, encoding='gbk')
# Note: shuffle must not be used here. Instead the rows are taken at a
# random permutation of their positions and converted to a plain array.
sampler = np.random.permutation(len(data))
d = data.take(sampler).values
7 使用支援向量機(SVM)進行分類,得到混淆矩陣、準確率與混淆矩陣圖
# coding: utf-8
# 9.2.2 Model construction
# Step 1 - model input: 80% of the rows form the training set and the
# remaining 20% form the test set.
import pickle
import random

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from sklearn import metrics
from sklearn import svm

# --- Data sampling ---------------------------------------------------------
inputfile = 'moment.csv'
data = pd.read_csv(inputfile, encoding='gbk')

# Note: shuffle must not be used here. The rows are reordered by taking
# them at a random permutation of their positions instead.
sampler = np.random.permutation(len(data))
d = data.take(sampler).values

split_at = int(0.8 * len(data))
data_train = d[:split_at, :]  # first 80% of the shuffled rows: training set
data_test = d[split_at:, :]   # last 20% of the shuffled rows: test set

# --- Support vector machine model -----------------------------------------
# Column 0 holds the class label; the features start at column 2 and are
# multiplied by 30 to enlarge them.
x_train = data_train[:, 2:] * 30
y_train = data_train[:, 0].astype(int)
x_test = data_test[:, 2:] * 30
y_test = data_test[:, 0].astype(int)

# Create and train the model.
model = svm.SVC()
model.fit(x_train, y_train)

# Persist the trained model. The `with` block closes the file handle.
# To load it again:
#     with open('svcmodel.model', 'rb') as f: model = pickle.load(f)
with open('svcmodel.model', 'wb') as model_file:
    pickle.dump(model, model_file)

# --- Confusion matrices ------------------------------------------------------
# Use the classes the model was trained on as the matrix labels, so both
# matrices always have the same shape and the row/column names match the
# data, even if a class is missing from one of the splits.
labels = model.classes_
cm_train = metrics.confusion_matrix(  # confusion matrix of the training samples
    y_train, model.predict(x_train), labels=labels)
cm_test = metrics.confusion_matrix(   # confusion matrix of the test samples
    y_test, model.predict(x_test), labels=labels)
df1 = DataFrame(cm_train, index=labels, columns=labels)
df2 = DataFrame(cm_test, index=labels, columns=labels)
df1.to_excel('trainPre.xlsx')
df2.to_excel('testPre.xlsx')

print(model.score(x_train, y_train))  # accuracy on the training set
print(model.score(x_test, y_test))    # accuracy on the test set
# Plot the confusion matrix. The commented-out code below draws it directly
# with matplotlib; the active code after it uses the cm_plot helper instead.
# import matplotlib.pyplot as plt # import the plotting library
# get_ipython().magic(u'matplotlib inline')
# plt.matshow(cm_test, cmap=plt.cm.Greens) # draw the confusion matrix with the cm.Greens colour map; see the official site for more styles
# plt.colorbar() # colour bar
#
# for x in range(len(cm_test)): # data labels
#     for y in range(len(cm_test)):
#         plt.annotate(cm_test[x,y], xy=(x, y), horizontalalignment='center', verticalalignment='center')
#
# plt.ylabel('True label') # axis label
# plt.xlabel('Predicted label') # axis label
# plt.show()
#'''
# Equivalent to the following.
# NOTE(review): the commented-out code plots cm_test, while the call below
# is given the training labels and predictions - confirm which is intended.
from cm_plot import *
cm_plot(y_train, model.predict(x_train)).show() # cm_plot is a user-defined function that draws the confusion matrix
#'''