此文章僅為本人的學習筆記,侵權删。
視訊位址: 【尚學堂】AI人工智能PyTorch深度學習進階教程_PyTorch反向傳播推導_代碼實作神經網絡算法_PyTorch神經網絡_PyTorch深度學習課程
參考文章:
-
“反向傳播算法”過程及公式推導(超直覺好懂的Backpropagation)
這講了正向傳播和反向傳播的具體過程。
-
反向傳播算法(過程及公式推導)
這篇文章講了反向傳播算法的推導
代碼練習:
資料集:資料
Python實作神經網絡完成手寫數字識别任務
激活函數是relu,輸出層是softmax分類
import numpy as np
from sklearn.datasets import fetch_mldata
from sklearn.utils.extmath import safe_sparse_dot
# The network's output layer has one node per class (10 for the digits 0-9),
# so each target label is converted to a one-hot vector.
def tran_y(y_true, n_classes=10):
    """Return the one-hot encoding of a single class label.

    :param y_true: class label; it is converted with ``int()`` before use
    :param n_classes: length of the returned vector (defaults to 10)
    :return: 1-D float array of length ``n_classes`` that is 1 at the label
        index and 0 elsewhere
    :raises IndexError: if the label is outside ``[0, n_classes)``
    """
    label = int(y_true)
    # Check the range explicitly: a negative index would otherwise wrap
    # around and silently mark the wrong class.
    if not 0 <= label < n_classes:
        raise IndexError(
            "label %d is out of range for %d classes" % (label, n_classes)
        )
    y_ohe = np.zeros(n_classes)
    y_ohe[label] = 1
    return y_ohe
# Load the MNIST data set, caching it under the given data_home directory.
# NOTE(review): fetch_mldata may not exist in newer scikit-learn releases
# (fetch_openml is the usual replacement) - confirm the installed version.
mnist = fetch_mldata('MNIST original', data_home='data/for_my_own_nn_data/')
X = mnist["data"]
y = mnist["target"]
print(X.shape)
print(y.shape)

# Replace every scalar label by its one-hot row.
y = np.array([tran_y(label) for label in y])

# Hyperparameters.
hidden_layer_sizes = [300, 100]  # two hidden layers: 300 units, then 100
max_iter = 200                   # number of passes over the training data
alpha = 1e-4                     # L2 regularisation coefficient
learning_rate = 1e-3             # gradient-descent step size
(70000, 784)
(70000,)
def log_loss(y_true, y_prob):
    """Compute the mean cross-entropy (logistic loss) for classification.

    :param y_true: 2-D array-like of targets with one row per sample; either
        one column per class or a single column of binary targets
    :param y_prob: 2-D array-like of predicted probabilities, laid out like
        ``y_true``
    :return: the summed cross-entropy divided by the number of rows
    """
    # Accept plain nested lists as well as arrays (np.clip already coerces
    # y_prob; y_true needs the same treatment before .shape is read).
    y_true = np.asarray(y_true)
    # Keep probabilities away from 0 and 1 so that log() stays finite.
    y_prob = np.clip(y_prob, 1e-10, 1 - 1e-10)

    # A single column is expanded into two complementary columns.
    if y_prob.shape[1] == 1:
        y_prob = np.append(1 - y_prob, y_prob, axis=1)
    if y_true.shape[1] == 1:
        y_true = np.append(1 - y_true, y_true, axis=1)

    return -np.sum(y_true * np.log(y_prob)) / y_prob.shape[0]
def softmax(x):
    """Apply a row-wise softmax to a 2-D array.

    Floating-point input is overwritten in place and the same array object is
    returned. Input of any other dtype cannot hold the result, so it is first
    converted to a float64 copy and that copy is returned instead.

    :param x: 2-D NumPy array of scores, one row per sample
    :return: array of the same shape whose rows sum to 1
    """
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(np.float64)
    # Subtract the per-row maximum before exponentiating so that large
    # scores do not overflow; the softmax value is unchanged by the shift.
    shifted = x - x.max(axis=1, keepdims=True)
    np.exp(shifted, out=x)
    x /= x.sum(axis=1, keepdims=True)
    return x
def relu(x):
    """Apply the ReLU activation ``max(x, 0)`` element-wise, in place.

    ``np.maximum`` is used instead of clipping against ``np.finfo(x.dtype).max``
    so that integer arrays, for which ``np.finfo`` raises, are handled too.

    :param x: NumPy array; it is overwritten with the result
    :return: the same array object ``x``
    """
    np.maximum(x, 0, out=x)
    return x
def relu_derivative(z, delta):
    """Apply the derivative of ReLU to a back-propagated error, in place.

    Entries of ``delta`` are set to 0 wherever the matching entry of ``z`` is
    not positive; all other entries are left untouched. Using ``<=`` rather
    than ``==`` gives the same result when ``z`` holds ReLU outputs (which are
    never negative) and stays correct if pre-activation values are passed.

    :param z: array of layer values, same shape as ``delta``
    :param delta: error array; modified in place
    :return: None
    """
    delta[z <= 0] = 0
def gen_batches(n, bs):
    """Yield slices that split ``n`` samples into consecutive batches.

    Every slice covers ``bs`` indices except possibly the last one, which
    covers whatever remains when ``n`` is not a multiple of ``bs``.

    :param n: total number of samples
    :param bs: batch size; must be positive
    :return: generator of ``slice(start, end)`` objects covering ``[0, n)``
    :raises ValueError: if ``bs`` is not positive (raised when iteration starts)
    """
    if bs <= 0:
        raise ValueError("bs must be positive, got %r" % (bs,))
    start = 0
    for _ in range(int(n // bs)):
        end = start + bs
        yield slice(start, end)
        start = end
    # Emit the shorter trailing batch, if any samples are left over.
    if start < n:
        yield slice(start, n)
# Problem dimensions, read from the data.
n_samples, n_features = X.shape
n_outputs = y.shape[1]

# Mini-batch size: at most 200, never more than the number of samples.
batch_size = min(200, n_samples)

# Units per layer: input width, then each hidden layer, then output width.
layer_units = [n_features, *hidden_layer_sizes, n_outputs]
# Total number of layers, counting the input and output layers.
n_layers = len(layer_units)
# Initialise the weight matrix W and bias vector b of every layer.
coefs_ = []
intercepts_ = []
# Walk over each pair of adjacent layers (fan_in units feeding fan_out units).
for fan_in, fan_out in zip(layer_units[:-1], layer_units[1:]):
    # Initialisation recommended by the Xavier Glorot paper: draw uniformly
    # from [-bound, bound] with bound = sqrt(6 / (fan_in + fan_out)).
    factor = 6.
    init_bound = np.sqrt(factor / (fan_in + fan_out))
    coef_init = np.random.uniform(-init_bound, init_bound, (fan_in, fan_out))
    intercept_init = np.random.uniform(-init_bound, init_bound, fan_out)
    coefs_.append(coef_init)
    intercepts_.append(intercept_init)
# Containers for the per-layer forward results, the per-layer gradients and
# the intermediate deltas used by back-propagation.

# activations[k] is the output of layer k: the input layer is X itself, and
# every later layer starts as an uninitialised (batch_size, units) array.
activations = [X] + [np.empty((batch_size, width)) for width in layer_units[1:]]

# One uninitialised delta array per activation, with the same shape and dtype.
deltas = [np.empty_like(layer_out) for layer_out in activations]

# Gradient buffer for each weight matrix between two adjacent layers.
coef_grads = [np.empty(shape)
              for shape in zip(layer_units[:-1], layer_units[1:])]

# Gradient buffer for each bias vector (one per non-input layer).
intercept_grads = [np.empty(width) for width in layer_units[1:]]

loss_ = 0.0
# Mini-batch gradient descent.
for it in range(max_iter):  # one pass over the data per iteration
    # Shuffle the samples and their targets with the same permutation.
    arr = np.arange(n_samples)
    np.random.shuffle(arr)
    X = X[arr]
    y = y[arr]
    accumulated_loss = 0.0
    for batch_slice in gen_batches(n_samples, batch_size):  # one mini-batch
        batch_X = X[batch_slice]
        batch_y = y[batch_slice]
        # Number of samples in this mini-batch (the last one may be shorter).
        n_batch_samples = len(batch_y)
        # The input layer's activation is the batch itself.
        activations[0] = batch_X

        # ---- Forward pass ----
        for i in range(n_layers - 1):
            activations[i + 1] = safe_sparse_dot(activations[i], coefs_[i])
            activations[i + 1] += intercepts_[i]
            # Hidden layers use ReLU.
            if (i + 1) != (n_layers - 1):
                activations[i + 1] = relu(activations[i + 1])
        # The output layer uses softmax.
        activations[-1] = softmax(activations[-1])

        # Mean cross-entropy loss of the batch.
        loss = log_loss(batch_y, activations[-1])
        # Add the L2 regularisation term.
        values = np.sum(np.array([np.dot(s.ravel(), s.ravel()) for s in coefs_]))
        loss += (0.5 * alpha) * values / n_batch_samples
        accumulated_loss += loss * n_batch_samples

        # ---- Backward pass ----
        # `last` is the index of the first weight matrix handled when walking
        # backwards, i.e. the one feeding the output layer.
        last = n_layers - 2
        # For softmax combined with categorical cross-entropy the output
        # delta is simply prediction minus target.
        deltas[last] = activations[-1] - batch_y
        # Gradient of the last weight matrix:
        # 1. contribution of the base loss
        coef_grads[last] = safe_sparse_dot(activations[last].T, deltas[last])
        # 2. contribution of the L2 term
        coef_grads[last] += (alpha * coefs_[last])
        # 3. average over the samples of THIS mini-batch: the deltas only
        #    span the batch, so dividing by the full data set size would
        #    shrink the weight gradient relative to the bias gradient below.
        coef_grads[last] /= n_batch_samples
        # 4. bias gradient: batch mean of the deltas
        intercept_grads[last] = np.mean(deltas[last], 0)

        # Propagate through the hidden layers, from back to front.
        for i in range(n_layers - 2, 0, -1):
            # deltas_previous = deltas . W^T, then the activation derivative
            deltas[i - 1] = safe_sparse_dot(deltas[i], coefs_[i].T)
            relu_derivative(activations[i], deltas[i - 1])
            # Gradient of the weight matrix in front of this hidden layer:
            # 1. contribution of the base loss
            coef_grads[i - 1] = safe_sparse_dot(activations[i - 1].T, deltas[i - 1])
            # 2. contribution of the L2 term
            coef_grads[i - 1] += (alpha * coefs_[i - 1])
            # 3. average over the samples of this mini-batch
            coef_grads[i - 1] /= n_batch_samples
            # 4. bias gradient: batch mean of the deltas
            intercept_grads[i - 1] = np.mean(deltas[i - 1], 0)

        # ---- Parameter update ----
        # `+` here concatenates Python lists; it is not NumPy addition.
        # grads: list, length = len(coefs_) + len(intercepts_)
        grads = coef_grads + intercept_grads
        updates = [-learning_rate * grad for grad in grads]
        # W(t+1) = W(t) - learning_rate * grad
        # params: list, length = len(coefs_) + len(intercepts_)
        params = coefs_ + intercepts_
        for param, update in zip(params, updates):
            # In-place add, so the arrays stored in coefs_ / intercepts_ change.
            param += update

    loss_ = accumulated_loss / X.shape[0]
    print("Iteration %d, loss = %.8f" % (it, loss_))
    # TODO: break out once the loss has changed by less than a threshold
    #       for 10 consecutive iterations.
    # TODO: adjust the learning rate after every iteration.
    # TODO: use train_test_split to evaluate accuracy on a test set.
Iteration 0, loss = 16.41843381
Iteration 1, loss = 12.05028002
Iteration 2, loss = 9.59518521
Iteration 3, loss = 8.07381126
Iteration 4, loss = 7.05051051
Iteration 5, loss = 6.34085800
Iteration 6, loss = 5.80947574
Iteration 7, loss = 5.39779761
Iteration 8, loss = 5.06627548
Iteration 9, loss = 4.78721871
Iteration 10, loss = 4.55101589
Iteration 11, loss = 4.35204685
Iteration 12, loss = 4.17534207
Iteration 13, loss = 4.02260116
Iteration 14, loss = 3.88357171
Iteration 15, loss = 3.76303320
Iteration 16, loss = 3.65209822
Iteration 17, loss = 3.55253729
Iteration 18, loss = 3.46278989
Iteration 19, loss = 3.37840236
Iteration 20, loss = 3.30269155
Iteration 21, loss = 3.23202948
Iteration 22, loss = 3.16538411
Iteration 23, loss = 3.10627120
Iteration 24, loss = 3.04794362
Iteration 25, loss = 2.99438211
Iteration 26, loss = 2.94299062
Iteration 27, loss = 2.89611537
Iteration 28, loss = 2.85032957
Iteration 29, loss = 2.80667936
Iteration 30, loss = 2.76552825
Iteration 31, loss = 2.72595690
Iteration 32, loss = 2.68790869
Iteration 33, loss = 2.65275958
Iteration 34, loss = 2.61760695
Iteration 35, loss = 2.58602208
Iteration 36, loss = 2.55289127
Iteration 37, loss = 2.52326688
Iteration 38, loss = 2.49402990