SVM在中等維度的分類問題中有較好的表現,其在某種程度上建構了一個簡單的網絡結構,類似於神經網絡中的RBF神經網絡。
人臉資料集是分類和聚類問題中經常使用的經典資料集,維度相對不高,且為灰階圖像。這裡選用64*64的人臉圖像,將其reshape成1*64^2的一維數組,共40類樣本,每類10個。
通常在SVM解決較高維度問題時,需要將其適度降維,這裡選用傳統的線性降維方法PCA(KPCA同樣適用,只是對人臉資料集而言PCA已經夠用,不必換成更複雜的非線性降維)。
- 導入相關模組;
- 擷取資料;
- 标準化(這裡可以不用,圖像資料一般已經是标準的)
- 比較降維到不同的次元下,各自的效果對比(這時SVM的參數不變,選用RBF核函數)
- 用網格搜尋最佳的pca參數和SVM參數,效果示範。
以下為實驗代碼模組
(1)導入模組
# -*-encoding:utf-8-*-
'''
created by zwg in 2017-03-01
'''
import numpy,time
from sklearn import datasets
from sklearn import svm
from sklearn import decomposition
from sklearn import manifold
from sklearn.cross_validation import train_test_split as tts
from sklearn import svm,neural_network
from sklearn.metrics import classification_report,precision_score,recall_score,f1_score
from sklearn import pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.grid_search import GridSearchCV
from matplotlib import pyplot
import matplotlib.colors as colors
(2)擷取資料
def get_data():
    """Load the Olivetti faces data set through scikit-learn.

    Returns:
        tuple: ``(data, target)``, the ``data`` and ``target`` attributes of
        the object returned by ``datasets.fetch_olivetti_faces()``.
    """
    # datasets.load_iris() was used as an alternative source while experimenting.
    faces = datasets.fetch_olivetti_faces()
    return faces.data, faces.target
(3)PCA函數
def pca(x, n):
    """Fit a PCA model on ``x`` and return ``x`` projected by it.

    Args:
        x: Sample matrix handed to ``PCA.fit_transform``.
        n: Value forwarded as ``n_components`` to ``decomposition.PCA``.

    Returns:
        The result of ``fit_transform`` on ``x``.
    """
    reducer = decomposition.PCA(n_components=n)
    return reducer.fit_transform(x)
(4)PCA & SVM(傳回得分系數)
def pca_svm(pca_n=10, svm_C=1):
    """Fit PCA followed by an SVC on the face data and score a held-out split.

    The data from ``get_data()`` is split with ``tts(..., random_state=33)``.
    PCA is fitted on the training part only and then applied to the test
    part; no feature scaling is performed beforehand.

    Args:
        pca_n: ``n_components`` for ``decomposition.PCA``. Integer values
            larger than ``min(n_samples, n_features)`` of the training split
            are clamped to that limit, because current scikit-learn rejects
            them with ``ValueError``.
        svm_C: ``C`` parameter for ``svm.SVC`` (all other SVC settings are
            left at their defaults).

    Returns:
        tuple: ``(ac, p, r, f1, t)`` where ``ac`` is ``SVC.score`` on the test
        split, ``p`` and ``r`` are the weighted precision and recall, ``f1``
        is the harmonic mean of ``p`` and ``r`` (not ``sklearn``'s
        ``f1_score``), and ``t`` is the elapsed wall-clock time in seconds.
    """
    # NOTE: the timed region starts before get_data(), so ``t`` also covers
    # loading and splitting the data, not just PCA + SVC.
    t1 = time.time()
    data, target = get_data()
    x_train, x_test, y_train, y_test = tts(data, target, random_state=33)

    # Callers may request more components than the training split can
    # provide; clamp integer requests instead of letting PCA fail.
    # Non-integer settings (e.g. a float or None) are passed through untouched.
    max_components = min(x_train.shape)
    if isinstance(pca_n, (int, numpy.integer)) and pca_n > max_components:
        pca_n = max_components

    pca_learner = decomposition.PCA(n_components=pca_n)
    x_train = pca_learner.fit_transform(x_train)

    svm_learner = svm.SVC(C=svm_C)
    svm_learner.fit(x_train, y_train)

    # Project the test data with the PCA fitted on the training data.
    x_test_pre = pca_learner.transform(x_test)
    y_test_pre = svm_learner.predict(x_test_pre)

    ac = svm_learner.score(x_test_pre, y_test)
    p = precision_score(y_test, y_test_pre, average='weighted')
    r = recall_score(y_test, y_test_pre, average='weighted')
    f1 = 2.0 / (1.0 / p + 1.0 / r)
    t = time.time() - t1
    return ac, p, r, f1, t
(5)PCA降維到不同次元下效果比較
def _marker_sizes(scores):
    """Rescale a list of scores to scatter marker sizes in the range 20..30."""
    column = numpy.array(scores).reshape((-1, 1))
    # ravel() hands matplotlib a flat array of sizes rather than an (n, 1) one.
    return MinMaxScaler((20, 30)).fit_transform(column).ravel()


def pca_svm_time_score_compare():
    """Plot SVM scores and run time against the number of PCA components.

    Calls ``pca_svm(pca_n=n)`` for ``n`` in ``numpy.arange(10, 410, 10)``.
    The upper subplot is a scatter of f1, recall, precision and accuracy,
    where each marker's size reflects that same series' value rescaled to
    20..30. The lower subplot is the elapsed time reported by ``pca_svm``.
    The figure is displayed with ``pyplot.show()``; nothing is returned.
    """
    steps = numpy.arange(10, 410, 10)

    # Named *_scores so the imported sklearn ``f1_score`` is not shadowed.
    ac_scores, p_scores, r_scores, f1_scores, elapsed = [], [], [], [], []
    for n in steps:
        ac, p, r, f1, t = pca_svm(pca_n=n)
        ac_scores.append(ac)
        p_scores.append(p)
        r_scores.append(r)
        f1_scores.append(f1)
        elapsed.append(t)

    pyplot.figure()

    pyplot.subplot(2, 1, 1)
    # Each series is sized by its own values (the f1 and precision sizes
    # were previously swapped).
    series = (
        (f1_scores, 'f1-score', 'red'),
        (r_scores, 'recall-score', 'blue'),
        (p_scores, 'precision-score', 'yellow'),
        (ac_scores, 'accuracy-score', 'purple'),
    )
    for scores, label, color in series:
        pyplot.scatter(steps, scores, label=label, color=color,
                       s=_marker_sizes(scores), alpha=0.7)
    pyplot.xlabel('n-components')
    pyplot.ylabel('score')
    pyplot.legend()
    pyplot.title('The Score Of SVM After PCA To N_components')

    pyplot.subplot(2, 1, 2)
    pyplot.plot(steps, elapsed, label='cost-time', color='black', marker='o')
    pyplot.legend()
    pyplot.xlabel('n-components')
    pyplot.ylabel('time')
    pyplot.show()
(6)網格搜尋pca和SVM最佳參數,并進行可視化
# pca before svm fitting is better
def pca_svm_pipeline():
    """Grid-search a PCA -> SVC pipeline and visualise its predictions.

    The data from ``get_data()`` is split with ``tts(..., random_state=33)``.
    ``GridSearchCV`` searches ``pca__n_components`` over
    ``numpy.arange(5, 200, 10)`` and ``svm__C`` over ``[1]`` on the training
    part. The best parameters and a classification report for the test part
    are printed.

    Every image in the full data set is then drawn in a grid: the column is
    chosen by the predicted label and the row by how many images were
    already placed in that column. Images whose prediction differs from the
    true label get a red ``x``. The figure is displayed with
    ``pyplot.show()``; nothing is returned.
    """
    svm_C = [1]  # numpy.linspace(0.5, 10, 10) was tried as a wider grid
    pca_n_components = numpy.arange(5, 200, 10)

    data, target = get_data()
    x_train, x_test, y_train, y_test = tts(data, target, random_state=33)

    pipe = pipeline.Pipeline([('pca', decomposition.PCA()),
                              ('svm', svm.SVC())])
    gscv = GridSearchCV(
        pipe,
        {'pca__n_components': pca_n_components, 'svm__C': svm_C},
        n_jobs=-1)
    gscv.fit(x_train, y_train)

    y_test_pre = gscv.predict(x_test)
    report = classification_report(y_test, y_test_pre)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(gscv.best_params_)
    print(report)

    target_pre = gscv.predict(data)
    pyplot.figure()

    # Number of images already drawn per predicted label (40 slots, hard-coded).
    L = numpy.zeros((40,))
    # Coordinates of one 64x64 tile; yy runs from high to low.
    xx = numpy.linspace(0, 1, 64) + 13
    yy = numpy.linspace(1, 0, 64) + 13
    xx, yy = numpy.meshgrid(xx, yy)

    for image, true_label, k in zip(data, target, target_pre):
        g = L[k]
        L[k] += 1
        # Shift the tile: column by predicted label, row by running count.
        xx1 = xx - k
        yy1 = yy - g
        pyplot.contourf(xx1, yy1, image.reshape((64, 64)), cmap='gray')
        if true_label != k:
            # Mark misclassified images at the centre of their tile.
            pyplot.scatter(numpy.mean(xx1), numpy.mean(yy1),
                           marker='x', c='red', s=40)

    pyplot.axis('off')
    # grid('off') passes a truthy string; False disables the grid explicitly.
    pyplot.grid(False)
    pyplot.title('PCA & SVM Recongnize Faces')
    pyplot.show()
(7)調用
if __name__ == '__main__':
    # Run the grid search with its visualisation first, then the sweep over
    # PCA component counts.
    for demo in (pca_svm_pipeline, pca_svm_time_score_compare):
        demo()
結果:(1)降維至10~60多維時,效果最好。
![](https://img.laitimes.com/img/__Qf2AjLwojIjJCLyojI0JCLiIXZ05WZD9CX5RXa2Fmcn9CXwczLcVmds92czlGZvwVP9EUTDZ0aRJkSwk0LcxGbpZ2LcBDM08CXlpXazRnbvZ2LcRlMMVDT2EWNvwFdu9mZvw1MFpnTmlzVk5GbXp1Mk1mYohWblZXUYpVd1kmYr50MZV3YyI2cKJDT29GRjBjUIF2LcRHelR3LcJzLctmch1mclRXY39TMxkTOwETM1ETOwQDM3EDMy8CX0Vmbu4GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.jpg)
結果:(2)最佳參數以及分類結果:PCA降至35維,SVM參數C=1。
{'pca__n_components': 35, 'svm__C': 1}
precision recall f1-score support
0 1.00 0.80 0.89 5
1 1.00 1.00 1.00 1
3 1.00 0.67 0.80 3
4 1.00 1.00 1.00 1
5 1.00 1.00 1.00 1
6 1.00 1.00 1.00 3
7 0.75 1.00 0.86 3
8 1.00 1.00 1.00 1
9 0.67 1.00 0.80 2
10 1.00 1.00 1.00 1
11 1.00 1.00 1.00 1
12 0.50 0.50 0.50 2
13 1.00 1.00 1.00 1
14 1.00 1.00 1.00 3
15 1.00 0.50 0.67 2
17 1.00 1.00 1.00 2
18 1.00 1.00 1.00 2
19 1.00 1.00 1.00 3
20 1.00 1.00 1.00 2
21 1.00 1.00 1.00 2
22 0.67 1.00 0.80 2
23 1.00 0.50 0.67 2
24 1.00 1.00 1.00 4
25 1.00 1.00 1.00 1
26 1.00 1.00 1.00 4
27 1.00 1.00 1.00 4
28 1.00 1.00 1.00 2
29 1.00 1.00 1.00 4
30 1.00 1.00 1.00 6
31 1.00 1.00 1.00 1
32 1.00 1.00 1.00 4
33 1.00 1.00 1.00 4
34 1.00 1.00 1.00 4
35 1.00 1.00 1.00 3
36 1.00 1.00 1.00 5
37 1.00 1.00 1.00 2
38 1.00 1.00 1.00 4
39 0.75 1.00 0.86 3
avg / total 0.96 0.95 0.95 100