TensorFlow 中的 LRNOp 與 Caffe 的差異:
- 直接使用平方和而不是像論文中一樣使用平方和的均值,是以算子的推薦參數有所不同;
- 僅支援 NHWC 格式的輸入;
- CPU 後端除架構源碼外還會調用 MKL;
- GPU 後端僅有 cuDNN 實作,在前後插入轉置操作。
sqr_sum[a, b, c, d] =
sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
output = input / (bias + alpha * sqr_sum) ** beta
下面對于架構内部的 CPU 實作進行介紹。
LRNOp
OP_REQUIRES_OK 在錯誤時列印檔案名、行号和錯誤碼。
OP_REQUIRES 接受表達式和傳回值。
FastBoundsCheck 檢查
depth_radius64
有沒有超出 int 的範圍。
OpKernel 是所有 Op 類的基類。
template <typename Device, typename T>
class LRNOp : public OpKernel {
public:
explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) {
int64_t depth_radius64;
OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
OP_REQUIRES(
context,
FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
errors::InvalidArgument("depth_radius = ", depth_radius64,
" larger than int max"));
depth_radius_ = static_cast<int>(depth_radius64);
float tmp;
OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
bias_ = T(tmp);
OP_REQUIRES_OK(context, context->GetAttr("alpha", &tmp));
alpha_ = T(tmp);
OP_REQUIRES_OK(context, context->GetAttr("beta", &tmp));
beta_ = T(tmp);
}
LRNOp::Compute
輸入次元要求是4維。
輸入元素數量在
int
的表示範圍内。
僅支援 NHWC 布局。
通道數量加填充也需要在
int
的表示範圍内。
void Compute(OpKernelContext* context) override {
const Tensor& in = context->input(0);
OP_REQUIRES(context, in.dims() == 4,
errors::InvalidArgument("in must be 4-dimensional"));
OP_REQUIRES(
context,
FastBoundsCheck(in.NumElements(), std::numeric_limits<int>::max()),
errors::InvalidArgument("argument to LRN too large"));
// Cast to platform-specific int to avoid conversion warnings.
const int batch = static_cast<int>(in.dim_size(0));
const int rows = static_cast<int>(in.dim_size(1));
const int cols = static_cast<int>(in.dim_size(2));
const int depth = static_cast<int>(in.dim_size(3));
OP_REQUIRES(context,
(depth + depth_radius_) <= std::numeric_limits<int>::max(),
errors::InvalidArgument("depth ", depth, " + depth_radius ",
depth_radius_, " exceeds int max."));
OpKernelContext::allocate_output 為輸出配置設定記憶體。
LaunchLRN<CPUDevice, T> 為 CPU 後端的實作。
Tensor* output = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(
0, TensorShape({batch, rows, cols, depth}), &output));
LaunchLRN<Device, T> launcher(depth_radius_, bias_, alpha_, beta_);
launcher.launch(context, this, in, output);
}
private:
int depth_radius_;
T bias_;
T alpha_;
T beta_;
};
LaunchLRN<CPUDevice, T>
LaunchLRN::launch
beta_
是特定值時且通道數量大于 kSingleThreadedLRNDepthCutoff 才會調用 LaunchLRN::SingleThreadedLRN 來實作。
template <typename T>
struct LaunchLRN<CPUDevice, T> {
LaunchLRN(int depth_radius, T bias, T alpha, T beta)
: depth_radius_(depth_radius), bias_(bias), alpha_(alpha), beta_(beta) {}
void launch(OpKernelContext* context, OpKernel* kernel, const Tensor& in,
Tensor* output) {
const int batch = static_cast<int>(in.dim_size(0));
const int rows = static_cast<int>(in.dim_size(1));
const int cols = static_cast<int>(in.dim_size(2));
const int depth = static_cast<int>(in.dim_size(3));
#if defined(IS_MOBILE_PLATFORM)
SingleThreadedLRN(in, batch, rows, cols, depth, output);
#else
if (depth > kSingleThreadedLRNDepthCutoff &&
(beta_ == T(0.5) || beta_ == T(1))) {
SingleThreadedLRN(in, batch, rows, cols, depth, output);
return;
}
nodes
為特征圖的元素數量。
Tensor::shaped 将現有 Tensor 中的資料映射到一個對應次元和類型的 Eigen::TensorMap。
将輸入轉為二維矩陣
in_shaped
。
const int nodes = cols * rows;
auto in_shaped = in.shaped<T, 2>({nodes * batch, depth});
multiplier
是一個二維的 Eigen::Tensor。
GetBandMatrix 生成帶狀矩陣。将輸入與帶矩陣相乘,可減少沿深度方向的适合分片數量。
// Multiplying the input with the band matrix has the effect of reducing the
// correct patch along the depth.
Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
GetBandMatrix<T>(depth, depth_radius_, &multiplier);
将輸出轉為二維矩陣
out_shaped
。
Eigen::Tensor::contract 在二維情況下是矩陣乘。
in_shaped
在平方後與
multiplier
矩陣相乘實作求和,得到 b i a s + α ( ∑ j , i − n i + n x j 2 ) bias + \alpha(\sum_{j,{i - n}}^{i + n} x_j^2) bias+α(∑j,i−ni+nxj2)。
auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
激活的計算開銷較大是以對特殊情況進行優化:
-
:對beta_=1
取倒數;tmp
-
:則直接使用beta_=0.5
函數;rsqrt
- 其他:輸入乘以模值。
if (beta_ == T(1)) {
out_shaped.device(context->eigen_cpu_device()) =
in_shaped * tmp.inverse();
} else if (beta_ == T(0.5)) {
out_shaped.device(context->eigen_cpu_device()) = in_shaped * tmp.rsqrt();
} else {
out_shaped.device(context->eigen_cpu_device()) =
in_shaped * (tmp.log() * -beta_).exp();
}
#endif
}
private:
typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
LaunchLRN::SingleThreadedLRN
Eigen::Map 映射現有資料數組的矩陣或向量表達式。
Eigen::Matrix 有六個模闆參數,一般隻需指定前三個參數。
将
in
映射為二維矩陣。由于 Eigen 矩陣預設為列優先存儲順序,是以不需要進行轉置。
向量表達式的大小 為
depth
,跨度為
batch * rows * cols
。
padded_square
的大小為
depth+depth_radius_ * 2
DenseBase::setZero
void SingleThreadedLRN(const Tensor& in, const int batch, const int rows,
const int cols, const int depth, Tensor* out) {
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> data_in(
in.flat<T>().data(), depth, batch * rows * cols);
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> data_out(
out->flat<T>().data(), depth, batch * rows * cols);
const int double_depth_radius = depth_radius_ * 2;
Eigen::Matrix<T, Eigen::Dynamic, 1> padded_square(data_in.rows() +
double_depth_radius);
padded_square.setZero();
每次取一個空間位置上的所有通道,求平方;滑動視窗求和。需要 N H W × C NHW \times C NHW×C 兩層循環。僅在求平方時存在向量化。
塊操作跳過
padded_square
的前
depth_radius_
行,存儲 α x 2 \alpha x^2 αx2。
accumulated_scale
為目前位置上的 α ( ∑ j , i − n i + n x j 2 \alpha(\sum_{j,{i - n}}^{i + n} x_j^2 α(∑j,i−ni+nxj2。
data_out
中為 N = ( b i a s + α ( ∑ j , i − n i + n x j 2 ) ) N = (bias + \alpha(\sum_{j,{i - n}}^{i + n} x_j^2)) N=(bias+α(∑j,i−ni+nxj2))
for (int r = 0; r < data_in.cols(); ++r) {
// Do local response normalization for data_in(:, r). First, compute the
// square and store them in buffer for repeated use.
padded_square.block(depth_radius_, 0, data_out.rows(), 1) =
data_in.col(r).cwiseProduct(data_in.col(r)) * alpha_;
// Then, compute the scale and write it to data_out.
T accumulated_scale(0);
for (int i = 0; i < double_depth_radius; ++i) {
accumulated_scale += padded_square(i);
}
for (int i = 0; i < data_in.rows(); ++i) {
accumulated_scale += padded_square(i + double_depth_radius);
data_out(i, r) = bias_ + accumulated_scale;
accumulated_scale -= padded_square(i);
}
}
根據不同情況計算激活。
if (beta_ == T(1)) {
data_out.array() = data_in.array() * data_out.array().inverse();
} else if (beta_ == T(0.5)) {
data_out.array() = data_in.array() * data_out.array().rsqrt();
} else {
data_out.array() =
data_in.array() * (data_out.array().log() * -beta_).exp();
}
}
int depth_radius_;
T bias_;
T alpha_;
T beta_;
};
GetBandMatrix
沿着對角線上大小為(2*深度_半徑+1)的掃描線,建立一個逐深的帶狀矩陣,其中有1個。
沿對角線周圍的一條尺寸為( 2 深度_半徑+ 1)的條帶建立一個深度為1s的深度帶狀矩陣。
沿對角線周圍的大小(2depth_radius+1)建立一個帶1的逐深度帶矩陣。
// Create a depth-by-depth band matrix with 1s along a swath of size (2 *
// depth_radius + 1) around the diagonal.
template <typename T>
void GetBandMatrix(int depth, int depth_radius,
Eigen::Tensor<T, 2, Eigen::RowMajor>* result) {
result->setZero();
for (int row = 0; row < depth; ++row) {
const int begin = std::max<int>(0, row - depth_radius);
const int end = std::min<int>(depth, row + depth_radius + 1);
Eigen::DSizes<Eigen::DenseIndex, 2> start(row, begin);
Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, end - begin);
result->slice(start, sizes).setConstant(T(1));
}
}
LRNGradOp
template <typename Device, typename T>
class LRNGradOp : public OpKernel {
public:
explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
int64_t depth_radius64;
OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
OP_REQUIRES(
context,
FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
errors::InvalidArgument("depth_radius = ", depth_radius64,
" larger than int max"));
depth_radius_ = static_cast<int>(depth_radius64);
float tmp;
OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
bias_ = T(tmp);
OP_REQUIRES_OK(context, context->GetAttr("alpha", &tmp));
alpha_ = T(tmp);
OP_REQUIRES_OK(context, context->GetAttr("beta", &tmp));
beta_ = T(tmp);
}
LRNGradOp::Compute
建立一個 LaunchLRNGrad 對象并調用。
void Compute(OpKernelContext* context) override {
const Tensor& in_grads = context->input(0);
const Tensor& in_image = context->input(1);
const Tensor& out_image = context->input(2);
OP_REQUIRES(context, in_grads.dims() == 4 && in_image.dims() == 4,
errors::InvalidArgument("inputs must be 4-dimensional"));
const int64_t batch = in_grads.dim_size(0);
const int64_t rows = in_grads.dim_size(1);
const int64_t cols = in_grads.dim_size(2);
const int64_t depth = in_grads.dim_size(3);
OP_REQUIRES(
context,
in_image.dim_size(0) == batch && in_image.dim_size(1) == rows &&
in_image.dim_size(2) == cols && in_image.dim_size(3) == depth &&
out_image.dim_size(0) == batch && out_image.dim_size(1) == rows &&
out_image.dim_size(2) == cols && out_image.dim_size(3) == depth &&
out_image.dims() == 4,
errors::InvalidArgument(
"input_grads, input_image, and out_image should have the same "
"shape"));
Tensor* output = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(
0, TensorShape({batch, rows, cols, depth}), &output));
LaunchLRNGrad<Device, T> launcher(depth_radius_, bias_, alpha_, beta_);
launcher.launch(context, this, in_grads, in_image, out_image, output);
}
private:
int depth_radius_;
T bias_;
T alpha_;
T beta_;
};
LaunchLRNGrad
template <typename T>
struct LaunchLRNGrad<CPUDevice, T> {
LaunchLRNGrad(int depth_radius, T bias, T alpha, T beta)
: depth_radius_(depth_radius),
bias_(bias),
alpha_(alpha),
beta_(beta),
alpha_beta_2_(T(-2) * alpha * beta) {}
LaunchLRNGrad::launch
void launch(OpKernelContext* context, OpKernel* kernel,
const Tensor& in_grads, const Tensor& in_image,
const Tensor& out_image, Tensor* output) {
const int64_t batch = in_grads.dim_size(0);
const int64_t rows = in_grads.dim_size(1);
const int64_t cols = in_grads.dim_size(2);
const int64_t depth = in_grads.dim_size(3);
const auto nodes = cols * rows;
auto grads_shaped = in_grads.shaped<T, 2>({nodes * batch, depth});
auto in_shaped = in_image.shaped<T, 2>({nodes * batch, depth});
auto activations = out_image.shaped<T, 2>({nodes * batch, depth});
auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
out_shaped.setZero();
y i = x i ( b i a s + α ( ∑ j , i − n i + n x j 2 ) ) β y_i = \frac{x_i}{(bias + \alpha(\sum_{j,{i - n}}^{i + n} x_j^2))^\beta} yi=(bias+α(∑j,i−ni+nxj2))βxi
令
N = ( b i a s + α ( ∑ j , i − n i + n x j 2 ) ) = ( x i y i ) 1 β N = (bias + \alpha(\sum_{j,{i - n}}^{i + n} x_j^2)) = (\frac{x_i}{y_i})^{\frac{1}{\beta}} N=(bias+α(j,i−n∑i+nxj2))=(yixi)β1
N − β = y i x i N^{-{\beta}} = \frac{y_i}{x_i} N−β=xiyi
則
d y i d x i = N β − x i β ⋅ N β − 1 ⋅ 2 α x i N 2 β = 1 − 2 α β N − 1 ⋅ x i x i N β d y i d x j = − x i β ⋅ N β − 1 ⋅ 2 α x j N 2 β = − 2 α β ⋅ N − 1 ⋅ x i x j N β \begin{aligned} \frac{dy_i}{dx_i} &= \frac{N^\beta - x_i\beta\cdot N^{\beta-1}\cdot 2\alpha x_i}{N^{2\beta}}\\ &= \frac{1 - 2\alpha\beta N^{-1}\cdot x_i x_i}{N^{\beta}}\\ \frac{dy_i}{dx_j} &= \frac{ - x_i\beta \cdot N^{\beta-1}\cdot 2\alpha x_j}{N^{2\beta}}\\ &= \frac{ - 2\alpha\beta \cdot N^{-1}\cdot x_i x_j}{N^{\beta}} \end{aligned} dxidyidxjdyi=N2βNβ−xiβ⋅Nβ−1⋅2αxi=Nβ1−2αβN−1⋅xixi=N2β−xiβ⋅Nβ−1⋅2αxj=Nβ−2αβ⋅N−1⋅xixj
将 d y dy dy 修改為兩維。
對于每一個輸出元素,
norm
為 N N N,
pre_computed_pow
為 N − β N^{-\beta} N−β。
activations_ab2
為 − 2 α β y i -2\alpha\beta y_i −2αβyi
d E d x i = d E d y i d y i d x i = d y i 1 − 2 α β N − 1 ⋅ x i x j N β = g s ⋅ − 2 α β x i y i N \begin{aligned} \frac{dE}{dx_i} &= \frac{dE}{dy_i} \frac{dy_i}{dx_i}\\ &= dy_i \frac{1 - 2\alpha\beta N^{-1}\cdot x_i x_j}{N^{\beta}} \\ &= gs \cdot \frac{-2\alpha\beta x_i y_i}{N} \end{aligned} dxidE=dyidEdxidyi=dyiNβ1−2αβN−1⋅xixj=gs⋅N−2αβxiyi
是以 y i = N − β ⋅ x j y_i = N^{-{\beta}}\cdot x_j yi=N−β⋅xj。
求 N N N 時需要求和。
auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
depth](int64_t begin, int64_t end) {
for (int64_t i = begin; i < end; ++i) {
for (int64_t j = 0; j < depth; ++j) {
// Let y be the LRN activations and x be the inputs along the depth
// dimension. (LRN operates independently along rows, cols, and
// batch).
// We have
// yi = xi / (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius}
// x_j^2))^beta
//
// Let N = (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius}
// x_j^2))
// dy_i/dx_i = (N^beta - xi. beta*N^(beta-1)*2*alpha*xi)/N^(2*beta)
// dy_i/dx_j = ( - xi. beta*N^(beta-1)*2*alpha*xj)/N^(2*beta)
//
// NOTE(keveman) : We can compute N by doing (yi/xi) ^ (1/beta).
// However, this is numerically unstable for small values of xi. We
// compute N explicitly here to avoid that.
T gs = grads_shaped(i, j);
if (gs == T(0)) continue;
int64_t depth_begin = std::max<int64_t>(0, j - depth_radius_);
int64_t depth_end = std::min<int64_t>(depth, j + depth_radius_ + 1);
T norm(0);
for (int64_t k = depth_begin; k < depth_end; ++k) {
norm += in_shaped(i, k) * in_shaped(i, k);
}
norm = alpha_ * norm + bias_;
DCHECK_GT(norm, T(1e-6));
T pre_computed_pow = Eigen::numext::pow(norm, -beta_);
T activations_ab2 = alpha_beta_2_ * activations(i, j);
for (int64_t k = depth_begin; k < depth_end; ++k) {
T dyi = in_shaped(i, k) * activations_ab2 / norm;
if (k == j) {
dyi += pre_computed_pow;
}
dyi *= gs;
const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) += dyi;
}
}
}
};
Shard 使用線程池執行函數。
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
depth * depth, shard);
}
int depth_radius_;
T bias_;
T alpha_;
T beta_;
T alpha_beta_2_;
};
參考資料:
- Add gpu support for LRN #211
- [譯]TF-api(2) tf.nn.lrn
- tf.nn.local_response_normalization
- 【TensorFlow】tf.nn.local_response_normalization詳解,lrn正則法如何計算?
- torch.nn.LocalResponseNorm
- onnx2pytorch和onnx-simplifer新版介紹
- onnx/docs/Operators.md
- Tensorflow是如何注冊和調用C++ New Op的
- 建立運算
- tensorflow源碼剖析之framework-kernel
- 『深度長文』Tensorflow代碼解析(三)
- TensorFlow 源碼閱讀[1] OpKernel的注冊
- Tensorflow c++ 實踐及各種坑
- Eigen Tensors
- TensorFlow源碼分析——Tensor與Eigen
- 從零開始編寫深度學習庫(四)Eigen::Tensor學習使用及代碼重構
- std::vetcor到Eigen::Tensor再到Tensorflow::Tensor的轉換
- tensorflow源碼筆記(Eigen3部分)
- Creating tensorflow::Tensor from Eigen::Tensor
- Eigen 高維矩陣運算
- Using Eigen in CUDA kernels
- TensorFlow 測試最佳做法
- 【問題分析】Tensorflow Unit Test與Benchmark Test
- Catalog of coefficient-wise math functions
- 針對深度學習的GPU共享
- 如何讓Eigen代碼跑在GPU/線程池上1
- 小白的tensorflow+CUDA程式設計踩坑記錄
- 工作中 C++ 泛型程式設計用的多嗎?
- C++ Primer - 模闆與泛型程式設計
- 【C++ Primer】模闆與泛型程式設計
- Eigen庫中的Map類到底是做什麼的?
- Use TensorFlow with the SageMaker Python SDK
- [CPU] Best CPU Build found with MKL config in bazel build BUT with MKL Disabled in script, especially on LSTM, can’t undestand why #38858
- looking for source code of from gen_nn_ops in tensorflow
- tensorflow代碼解析——概覽
- 最大限度提升 CPU 上的 TensorFlow* 性能:推理工作負載的注意事項和建議
- tf.math.log
- Block operations
- Array類和元素級操作