機器學習-SVM實作人臉識別

加載sklearn中的人臉資料集

# Loader for the "Labeled Faces in the Wild" (LFW) people dataset.
from sklearn.datasets import fetch_lfw_people
# Load the dataset with default arguments; as the text below describes,
# the first call downloads the archive from the internet.
faces = fetch_lfw_people()

執行上面的第二行程式時，Python 會從網上下載 Labeled Faces in the Wild（LFW）people 資料集。這個資料集大約 200 MB，因為牆的原因，下載很慢，甚至會失敗。

可以改用百度雲下載該資料集，它是一個 .tgz 壓縮包。

連結：https://pan.baidu.com/s/1eySjV_1K2XYD5YYKCxiVEw

提取碼：3wut

把下載好的壓縮包放入 C:\Users\Tim\scikit_learn_data\lfw_home（其中 Tim 是我的使用者名稱，請換成你自己的），再次執行 faces = fetch_lfw_people() 即可成功。Jupyter Notebook 中的程式與輸出如下：

from sklearn.datasets import fetch_lfw_people
# Keep only the people who have at least 60 images in the dataset.
faces = fetch_lfw_people(min_faces_per_person=60)
# Names of the people (the class labels) that remain after filtering.
print(faces.target_names)
# Shape of the image array: (number of images, height, width).
print(faces.images.shape)

['Donald Rumsfeld' 'George W Bush' 'Gerhard Schroeder' 'Junichiro Koizumi'
 'Tony Blair']
(964, 62, 47)

# After the previous step we can also look at what the images look like.
import matplotlib.pyplot as plt
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Preview the first 15 faces on a 3x5 grid, labelling each with the person's name.
fig, ax = plt.subplots(3, 5)
for i, axi in enumerate(ax.flat):
    axi.imshow(faces.images[i], cmap='bone')
    # Hide the ticks; faces.target[i] is the index into faces.target_names.
    axi.set(xticks=[], yticks=[],
            xlabel=faces.target_names[faces.target[i]])

解決人臉識别（jupyter）

人臉識別是一個分類問題。在傳統機器學習（不含深度學習）中，SVM（支援向量機）處於王霸地位，所以這裡使用 SVM 對圖像進行訓練。

# SVC: support vector classifier, used here for the classification task.
from sklearn.svm import SVC
# The images are very high-dimensional, so reduce the dimensionality with PCA.
from sklearn.decomposition import PCA
# Pipeline helper that chains the steps into a single estimator.
from sklearn.pipeline import make_pipeline

# Project each image onto 150 whitened principal components (fixed seed for
# reproducibility), then classify with an RBF-kernel SVM.
# class_weight='balanced' reweights classes inversely to their frequency.
pca = PCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

和上一步一樣，在加載資料的同時看看資料長什麼樣子

from sklearn.datasets import fetch_lfw_people
# Load the LFW dataset again, keeping only people with at least 60 images.
faces = fetch_lfw_people(min_faces_per_person=60)
# Class label names, then the image array shape (images, height, width).
print(faces.target_names)
print(faces.images.shape)
import matplotlib.pyplot as plt
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Show the first 15 faces on a 3x5 grid with the person's name under each image.
fig, ax = plt.subplots(3, 5)
for i, axi in enumerate(ax.flat):
    axi.imshow(faces.images[i], cmap='bone')
    # Hide the ticks; faces.target[i] is the index into faces.target_names.
    axi.set(xticks=[], yticks=[],
            xlabel=faces.target_names[faces.target[i]])

切分訓練集和測試集

from sklearn.model_selection import train_test_split
# Split the flattened images (faces.data) and their labels into train and
# test sets. No test_size is given, so scikit-learn's default split is used;
# random_state fixes the shuffle so the split is reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target,
                                                random_state=40)

# Set up training: grid search over the SVC hyperparameters.
from sklearn.model_selection import GridSearchCV
# The 'svc__' prefix routes each parameter to the pipeline step named 'svc'.
param_grid = {'svc__C': [1, 5, 10],
              'svc__gamma': [0.0001, 0.0005, 0.001]}
grid = GridSearchCV(model, param_grid)

# Run the grid search on the training set; the %time Jupyter magic reports
# how long the call takes.
%time grid.fit(Xtrain, ytrain)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=150, random_state=42,
                                            svd_solver='auto', tol=0.0,
                                            whiten=True)),
                                       ('svc',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight='balanced', coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto_deprecated',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))],
                                verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'svc__C': [1, 5, 10],
                         'svc__gamma': [0.0001, 0.0005, 0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

# Show the hyperparameter combination that scored best in the grid search.
print(grid.best_params_)

{'svc__C': 10, 'svc__gamma': 0.0001}

檢視測試集的測試結果

# Use the best pipeline found by the grid search to predict the test set.
model = grid.best_estimator_
yfit = model.predict(Xtest)
# Bare expression: only displayed when it is the last line of a notebook cell.
yfit.shape
import matplotlib as mpl
# Use a font with CJK glyphs so the Chinese figure title renders, and keep
# the minus sign displayable with that font.
mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False
# Take the image size from the dataset instead of hard-coding 62x47, so the
# plot still works if the dataset is loaded with different image dimensions.
img_shape = faces.images.shape[1:]
# Plot the first 24 test images on a 4x6 grid.
fig, ax = plt.subplots(4, 6)
for i, axi in enumerate(ax.flat):
    # Each row of Xtest is a flattened image; restore its 2-D shape.
    axi.imshow(Xtest[i].reshape(img_shape), cmap='bone')
    axi.set(xticks=[], yticks=[])
    # Label with the last word of the predicted name; red marks a wrong prediction.
    axi.set_ylabel(faces.target_names[yfit[i]].split()[-1],
                   color='black' if yfit[i] == ytest[i] else 'red')
fig.suptitle('預測錯誤的名字被紅色标注', size=14);

可以看到，圖中顯示的 24 張測試圖片裡預測錯誤了四個，準確率欠佳。下面列印分類報告

from sklearn.metrics import classification_report
# Print per-class precision, recall, F1 and support for the test predictions,
# using the people's names as the row labels.
print(classification_report(ytest, yfit,
                            target_names=faces.target_names))

                   precision    recall  f1-score   support

  Donald Rumsfeld       0.75      0.87      0.81        31
    George W Bush       0.97      0.92      0.94       124
Gerhard Schroeder       0.80      0.83      0.81        29
Junichiro Koizumi       1.00      1.00      1.00        16
       Tony Blair       0.85      0.85      0.85        41

         accuracy                           0.90       241
        macro avg       0.87      0.89      0.88       241
     weighted avg       0.90      0.90      0.90       241

最後使用seaborn的heatmap列印混淆矩陣

import seaborn as sns
from sklearn.metrics import confusion_matrix
# Confusion matrix: rows are true labels, columns are predicted labels.
mat = confusion_matrix(ytest, yfit)
# Plot the transpose so the x axis shows the true label and the y axis the
# predicted label, matching the axis titles set below.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=faces.target_names,
            yticklabels=faces.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label');

對 SVM 不熟悉的讀者，可以回頭看一下 SVM 原理