- 本文介紹如何利用Matlab從頭搭建深度前饋神經網絡,實現mnist手寫數字資料集的識別,並展示各類優化算法的訓練效果,包括SGD、mSGD、AdaGrad、RMSProp、Adam,最終網絡的識別率能達到98%。讀者可自行調整網絡結構和參數。
-
本文的Matlab代碼和mnist資料集的下載位址為:
連結: https://pan.baidu.com/s/1Me0T2xZwpn3xnN7b7XPrbg 密碼: t23u
首先,神經網絡的激活函數選擇Sigmoid和ReLU兩種:隱藏層使用ReLU激活函數,輸出層使用Sigmoid激活函數。
function [y] = sigmoid(x)
%SIGMOID Element-wise logistic sigmoid activation.
%   y = SIGMOID(x) evaluates 1./(1 + exp(-x)) for every element of x and
%   returns an array of the same size as x.
    negExp = exp(-x);
    y = 1 ./ (1 + negExp);
end
function [y] = relu(x)
%RELU Element-wise rectified linear activation.
%   y = RELU(x) returns x with every negative element replaced by 0; y has
%   the same size as x.
%
%   Negative entries are zeroed by logical indexing rather than by
%   multiplying with the mask (x > 0): the product form evaluates
%   -Inf .* 0, which is NaN, so a pre-activation that overflowed to -Inf
%   would poison the whole network. NaN inputs still propagate as NaN
%   because NaN < 0 is false.
    y = x;
    y(x < 0) = 0;
end
建立深度網絡的結構
function [dnn,parameter] = creatnn(K)
%CREATNN Build a feed-forward network and a default optimizer configuration.
%   [dnn, parameter] = CREATNN(K) creates one layer per consecutive pair of
%   entries in K.a.
%
%   Input
%     K.a - vector of layer widths, input layer first (row or column).
%     K.f - cell array with one activation name per layer; K.f{i} is
%           stored unchanged in dnn{i}.function.
%
%   Output
%     dnn       - 1-by-(numel(K.a)-1) cell array. dnn{i} has fields
%                   W        : K.a(i+1)-by-K.a(i) weights drawn uniformly
%                              from [-r, r], r = sqrt(6/(K.a(i)+K.a(i+1)))
%                              (Glorot/Xavier uniform initialisation)
%                   b        : K.a(i+1)-by-1 biases, all 0.01
%                   function : activation name copied from K.f{i}
%     parameter - struct of optimizer defaults:
%                   learning_rate    = 0.01   learning rate
%                   momentum         = 0.9    momentum coefficient
%                                             (typically 0.5, 0.9 or 0.99)
%                   attenuation_rate = 0.9    decay coefficient
%                   delta            = 1e-6   numerical-stability constant
%                   step             = 0.001  step size
%                   method           = "SGD"  one of {'SGD','mSGD','nSGD',
%                                             'AdaGrad','RMSProp',
%                                             'nRMSProp','Adam'}
%                   beta1            = 0.9
%                   beta2            = 0.999

    % numel (not size(...,2)) so that a column-vector K.a is accepted too.
    numLayers = max(numel(K.a) - 1, 0);
    if numel(K.f) < numLayers
        error('creatnn:activationCount', ...
            'K.f must name an activation for each of the %d layers.', numLayers);
    end

    % Pre-allocate so dnn is defined even when there are no layers.
    dnn = cell(1, numLayers);
    for i = 1:numLayers
        fanIn  = K.a(i);
        fanOut = K.a(i+1);
        bound  = sqrt(6/(fanIn + fanOut));
        % a + (b - a).*rand is the uniform draw on [a, b]; using core rand
        % avoids requiring the Statistics Toolbox function unifrnd.
        dnn{i}.W = -bound + (2*bound).*rand(fanOut, fanIn);
        % Alternative initialisation: dnn{i}.W = normrnd(0,0.1,fanOut,fanIn);
        dnn{i}.function = K.f{i};
        dnn{i}.b = 0.01*ones(fanOut, 1);
    end

    parameter.learning_rate = 0.01;
    parameter.momentum = 0.9;
    parameter.attenuation_rate = 0.9;
    parameter.delta = 1e-6;
    parameter.step = 0.001;
    parameter.method = "SGD";
    parameter.beta1 = 0.9;
    parameter.beta2 = 0.999;
end
建構前向傳播函數
function [y, Y] = forwordprop(dnn,x)
%FORWORDPROP Forward pass of the network over a batch of inputs.
%   [y, Y] = FORWORDPROP(dnn, x) feeds x (one sample per column) through
%   every layer of dnn in order.
%
%   Input
%     dnn - cell array of layers; layer i provides W, b and function
%           ("relu" or "sigmoid", as string or char vector).
%     x   - input batch; dnn{1}.W*x must be defined.
%
%   Output
%     y - output of the last layer (x itself when dnn has no layers).
%     Y - cell array of activations: Y{1} is x and Y{i+1} is the output of
%         layer i.
%
%   Raises forwordprop:unknownActivation when a layer names an activation
%   other than relu/sigmoid. Without this check such a layer would silently
%   pass on the previous layer's output instead of its own.

    numLayers = numel(dnn);
    Y = cell(1, numLayers + 1);
    Y{1} = x;
    y = x;
    for i = 1:numLayers
        % The bias column is expanded across all samples of the batch.
        z = dnn{i}.W*y + dnn{i}.b;
        % char() lets the switch match both string scalars and char vectors.
        activation = char(dnn{i}.function);
        switch activation
            case 'relu'
                y = relu(z);
            case 'sigmoid'
                y = sigmoid(z);
            otherwise
                error('forwordprop:unknownActivation', ...
                    'Layer %d uses unsupported activation "%s".', i, activation);
        end
        Y{i+1} = y;
    end
end
建構反向誤差傳播函數
function [dnn] = backprop(x,label,dnn,parameter)
%BACKPROP Run one mini-batch training step and return the updated network.
%   dnn = BACKPROP(x, label, dnn, parameter) runs forwordprop on the batch
%   x (one sample per column), back-propagates the gradient of the summed
%   binary cross-entropy between the network output and label, and updates
%   W and b of every layer with the optimizer named in parameter.method.
%
%   Fields of parameter read here:
%     method           - 'SGD', 'mSGD', 'nSGD', 'AdaGrad', 'RMSProp', 'Adam'
%                        (string or char vector, case-sensitive)
%     learning_rate    - learning rate (all methods)
%     momentum         - momentum coefficient (mSGD, nSGD)
%     attenuation_rate - decay of the squared-gradient average (RMSProp)
%     delta            - stabilising constant in the denominators
%                        (AdaGrad, RMSProp, Adam)
%     beta1, beta2     - decay rates of Adam's first / second moment
%
%   'nSGD' (Nesterov momentum) is not implemented and applies exactly the
%   same update as 'mSGD'. 'nRMSProp' and any other name raise
%   backprop:unsupportedMethod instead of silently leaving the network
%   unchanged.
%
%   State kept between calls:
%     * a persistent call counter, used as the Adam bias-correction step
%       and as the index into E;
%     * persistent accumulators V (velocity / squared-gradient sums) and
%       s (Adam first moment), allocated from the first network seen;
%       `clear backprop` or `clear all` resets them;
%     * global E, where E(k) receives the mean cross-entropy of the batch
%       processed by the k-th call.

    persistent global_step;
    persistent V;
    persistent s;
    global E;

    % Validate before touching any state so a bad method name has no
    % side effects.
    method = char(parameter.method);
    if ~any(strcmp(method, {'SGD','mSGD','nSGD','AdaGrad','RMSProp','Adam'}))
        error('backprop:unsupportedMethod', ...
            'Optimizer "%s" is not implemented.', method);
    end

    numLayers = numel(dnn);
    m = size(x,2);
    [y, Y] = forwordprop(dnn,x);

    % Keep the output strictly inside (0,1): a saturated output of exactly
    % 0 or 1 would otherwise give a division by zero in the gradient and
    % log(0) in the loss, turning every weight into NaN.
    yc = min(max(y, eps), 1 - eps);
    g = -label./yc + (1 - label)./(1 - yc);

    if isempty(global_step)
        global_step = 0;
    end
    global_step = global_step + 1;
    E(global_step) = sum(sum(-label.*log(yc) - (1 - label).*log(1 - yc)))/m;

    if isempty(V)
        for i = 1:numLayers
            V{i}.vw = dnn{i}.W*0;
            V{i}.vb = dnn{i}.b*0;
        end
    end
    if strcmp(method, 'Adam') && isempty(s)
        for i = 1:numLayers
            s{i}.vw = dnn{i}.W*0;
            s{i}.vb = dnn{i}.b*0;
        end
    end

    lr = parameter.learning_rate;
    for i = numLayers:-1:1
        % Multiply by the activation derivative, expressed through the
        % layer output Y{i+1}.
        layerOut = Y{i+1};
        switch char(dnn{i}.function)
            case 'relu'
                g = g.*(layerOut > 0);
            case 'sigmoid'
                g = g.*layerOut.*(1 - layerOut);
        end

        % Batch-averaged gradients; Y{i} is the input of layer i.
        dw = g*Y{i}.'/m;
        db = sum(g,2)/m;

        % Propagate to the previous layer with the weights from before
        % this step's update.
        g = dnn{i}.W'*g;

        switch method
            case 'SGD'
                dnn{i}.W = dnn{i}.W - lr*dw;
                dnn{i}.b = dnn{i}.b - lr*db;

            case {'mSGD','nSGD'}
                V{i}.vw = parameter.momentum*V{i}.vw - lr*dw;
                V{i}.vb = parameter.momentum*V{i}.vb - lr*db;
                dnn{i}.W = dnn{i}.W + V{i}.vw;
                dnn{i}.b = dnn{i}.b + V{i}.vb;

            case 'AdaGrad'
                V{i}.vw = V{i}.vw + dw.*dw;
                V{i}.vb = V{i}.vb + db.*db;
                dnn{i}.W = dnn{i}.W - lr./(parameter.delta + sqrt(V{i}.vw)).*dw;
                dnn{i}.b = dnn{i}.b - lr./(parameter.delta + sqrt(V{i}.vb)).*db;

            case 'RMSProp'
                rho = parameter.attenuation_rate;
                V{i}.vw = rho*V{i}.vw + (1 - rho)*dw.*dw;
                V{i}.vb = rho*V{i}.vb + (1 - rho)*db.*db;
                dnn{i}.W = dnn{i}.W - lr./sqrt(parameter.delta + V{i}.vw).*dw;
                dnn{i}.b = dnn{i}.b - lr./sqrt(parameter.delta + V{i}.vb).*db;

            case 'Adam'
                b1 = parameter.beta1;
                b2 = parameter.beta2;
                % First moment decays with beta1 (it previously decayed
                % with beta2, which mis-scaled the bias-corrected mean).
                s{i}.vw = b1*s{i}.vw + (1 - b1)*dw;
                s{i}.vb = b1*s{i}.vb + (1 - b1)*db;
                V{i}.vw = b2*V{i}.vw + (1 - b2)*dw.*dw;
                V{i}.vb = b2*V{i}.vb + (1 - b2)*db.*db;
                % Bias-corrected moment estimates.
                mHatW = s{i}.vw/(1 - b1^global_step);
                mHatB = s{i}.vb/(1 - b1^global_step);
                vHatW = V{i}.vw./(1 - b2^global_step);
                vHatB = V{i}.vb./(1 - b2^global_step);
                dnn{i}.W = dnn{i}.W - lr*mHatW./(parameter.delta + sqrt(vHatW));
                dnn{i}.b = dnn{i}.b - lr*mHatB./(parameter.delta + sqrt(vHatB));
        end
    end
end
- 注意,如果訓練過程中正確率很低,而且保持不變,應該考慮數值不穩定的問題,也就是計算中出現了Inf或NaN(例如輸出y恰好為0或1時,g的分母為零),可考慮在函數開始計算g的地方添加數值穩定項。
好了,到這裡,網絡需要的函數都搭建完成了。下面開始建構一個三隱層的前饋神經網絡(各層節點數為784-400-300-500-10),實現mnist資料集的識別。
% Training script: builds a 784-400-300-500-10 network and trains it on
% random mini-batches, reporting accuracy every 50 iterations.
% `clear all` also clears function workspaces, so backprop's persistent
% optimizer state starts fresh on every run.
clear all

% The .mat file is expected to supply train_x, train_y, test_x, test_y.
% NOTE(review): values are assumed to be uint8 in 0..255 with one sample
% per row (hence the /255 and the transposes) - confirm against the file.
load('mnist_uint8.mat');
train_x = (double(train_x)/255)';
test_x = (double(test_x)/255)';
train_y = double(train_y.');
test_y = double(test_y.');

% Layer widths and one activation per layer (three ReLU hidden layers,
% sigmoid output).
K.a = [784,400,300,500,10];
K.f = {"relu","relu","relu","sigmoid"};
[net,P] = creatnn(K);

% Override the defaults returned by creatnn.
P.learning_rate = 0.001;
P.method = "RMSProp";

batch_size = 100;
MAX_P = 2000;
m = size(train_x,2);
global E;   % filled by backprop with one loss value per training step

for i = 1:MAX_P
    % Draw batch_size column indices uniformly, with replacement.
    pick = randi(m,1,batch_size);
    batch_x = train_x(:,pick);
    batch_y = train_y(:,pick);
    net = backprop(batch_x,batch_y,net,P);

    if mod(i,50) ~= 0
        continue
    end

    % Accuracy on the batch just trained on: predicted class is the row
    % index of the largest output in each column.
    [scores,~] = forwordprop(net,batch_x);
    [~,predicted] = max(scores);
    [~,actual] = max(batch_y);
    fprintf("第%d訓練包的正确率:%f\n",i,nnz(predicted == actual)/batch_size)

    % Accuracy on the full test set.
    [scores,~] = forwordprop(net,test_x);
    [~,predicted] = max(scores);
    [~,actual] = max(test_y);
    fprintf("測試集的正确率:%f\n",nnz(predicted == actual)/size(test_x,2))
end
測試結果:
訓練誤差:
好了,大家可以自行搭建網絡了,如果遇到什麼問題可以下面留言哦!