Introduction
This article uses the object detection network CenterNet as a running example to introduce the principle behind deformable convolution, its Python implementation, and how it is deployed in an engineering setting.
The Principle of Deformable Convolution
Deformable convolution currently exists in two versions: DCN V1 and DCN V2, which was developed as an improvement on V1. For the details of the algorithm, read the original papers or refer to:
https://cloud.tencent.com/developer/article/1679389
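As a one-line summary (the notation here is mine, not taken from the references above): for each output location p, DCN V2 computes

y(p) = \sum_{k=1}^{K} w_k \cdot x(p + p_k + \Delta p_k) \cdot \Delta m_k

where the p_k are the fixed offsets of the K kernel taps (K = 9 for a 3x3 kernel), the \Delta p_k are learned, input-dependent offsets (the fractional sample x(p + p_k + \Delta p_k) is obtained by bilinear interpolation), and the \Delta m_k \in [0, 1] are learned modulation scalars. DCN V1 is the special case \Delta m_k = 1.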
Implementing Deformable Convolution
The explanation here mainly follows: https://cloud.tencent.com/developer/article/1638363
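To make the operation concrete before moving on to the TensorRT plugin, below is a minimal CPU sketch of modulated deformable convolution (single image, NCHW layout, square kernel, deformable_group = 1). It is my own illustration rather than the code from the article above, and the names bilinear and deform_conv2d_cpu are made up for this example; the CUDA implementation used later follows the same logic but goes through an im2col buffer plus a GEMM.

#include <cmath>

// Bilinear sampling of one channel at a fractional location (y, x); out-of-range taps read 0.
static float bilinear(const float* chan, int H, int W, float y, float x) {
    int y0 = static_cast<int>(std::floor(y));
    int x0 = static_cast<int>(std::floor(x));
    float val = 0.f;
    for (int dy = 0; dy <= 1; ++dy)
        for (int dx = 0; dx <= 1; ++dx) {
            int yy = y0 + dy, xx = x0 + dx;
            if (yy < 0 || yy >= H || xx < 0 || xx >= W) continue;
            float wy = 1.f - std::fabs(y - yy);
            float wx = 1.f - std::fabs(x - xx);
            val += wy * wx * chan[yy * W + xx];
        }
    return val;
}

// input:  C_in x H x W
// offset: (2*K*K) x H_out x W_out   learned (dy, dx) pairs, one pair per kernel tap and output location
// mask:   (K*K)   x H_out x W_out   learned modulation (already passed through a sigmoid)
// weight: C_out x C_in x K x K,  bias: C_out,  output: C_out x H_out x W_out
void deform_conv2d_cpu(const float* input, const float* offset, const float* mask,
                       const float* weight, const float* bias, float* output,
                       int C_in, int H, int W, int C_out, int K,
                       int pad, int stride, int dilation) {
    const int H_out = (H + 2 * pad - (dilation * (K - 1) + 1)) / stride + 1;
    const int W_out = (W + 2 * pad - (dilation * (K - 1) + 1)) / stride + 1;
    for (int co = 0; co < C_out; ++co)
        for (int ho = 0; ho < H_out; ++ho)
            for (int wo = 0; wo < W_out; ++wo) {
                float acc = bias[co];
                for (int ci = 0; ci < C_in; ++ci)
                    for (int kh = 0; kh < K; ++kh)
                        for (int kw = 0; kw < K; ++kw) {
                            int k = kh * K + kw;
                            // learned offset and modulation for this kernel tap at this output location
                            float dy = offset[((2 * k)     * H_out + ho) * W_out + wo];
                            float dx = offset[((2 * k + 1) * H_out + ho) * W_out + wo];
                            float m  = mask[(k * H_out + ho) * W_out + wo];
                            // deformed sampling position = regular grid position + learned offset
                            float y = ho * stride - pad + kh * dilation + dy;
                            float x = wo * stride - pad + kw * dilation + dx;
                            acc += m * weight[((co * C_in + ci) * K + kh) * K + kw]
                                     * bilinear(input + ci * H * W, H, W, y, x);
                        }
                output[(co * H_out + ho) * W_out + wo] = acc;
            }
}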
Engineering Deployment of DCN V2 in CenterNet
The code referenced in this section is: https://github.com/CaoWGG/TensorRT-CenterNet
It also draws on the following articles:
- Accelerating deep learning with TensorRT;
- Speeding up a neural network with TensorRT (loading and running an ONNX model);
- Implementing custom TensorRT plugins
A few points need to be made clear up front:
- To quantize and optimize the model with TensorRT, the CenterNet model is first exported to ONNX, and the official onnx-tensorrt library is then used to convert the ONNX model into a TensorRT engine;
- TensorRT has no built-in deformable-convolution operator, so a custom plugin has to be implemented for it.
The DCNv2 plugin used by CenterNet lives in the onnx-tensorrt folder and consists of four files: dcn_v2_im2col_cuda.cu and dcn_v2_im2col_cuda.h (the CUDA im2col kernels), plus DCNv2.cpp and DCNv2.h, which implement the custom TensorRT plugin itself. The main parts of DCNv2.cpp are shown below:
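// Constructor: store the layer hyper-parameters and copy the FP32 weight/bias values
// passed in from the ONNX importer into host-side buffers (_h_weight / _h_bias).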
DCNv2Plugin::DCNv2Plugin(int in_channel,
int out_channel,
int kernel_H,
int kernel_W,
int deformable_group,
int dilation,
int groups,
int padding,
int stride,
nvinfer1::Weights const &weight, nvinfer1::Weights const &bias):_in_channel(in_channel),
_out_channel(out_channel),_kernel_H(kernel_H),_kernel_W(kernel_W),_deformable_group(deformable_group),
_dilation(dilation),_groups(groups),_padding(padding),_stride(stride),_initialized(false){
if (weight.type == nvinfer1::DataType::kFLOAT)
{
_h_weight.assign((float*)weight.values,(float*)weight.values+weight.count);
} else { throw std::runtime_error("Unsupported weight dtype");}
if (bias.type == nvinfer1::DataType::kFLOAT)
{
_h_bias.assign((float*)bias.values,(float*)bias.values+bias.count);
} else { throw std::runtime_error("Unsupported bias dtype");}
}
// initialize(): allocate device memory for the parameters up front and copy them to the GPU
int DCNv2Plugin::initialize() {
if(_initialized) return 0;
auto _output_dims = this->getOutputDimensions(0, &this->getInputDims(0), 3);
assert(is_CHW(this->getInputDims(0)));
assert(is_CHW(_output_dims));
size_t ones_size = _output_dims.d[1]*_output_dims.d[2]* sizeof(float);
size_t weight_size = _h_weight.size()* sizeof(float);
size_t bias_size = _h_bias.size()* sizeof(float);
float *ones_cpu = new float[ones_size/ sizeof(float)];
for (int i = 0; i < ones_size/ sizeof(float); i++) {
ones_cpu[i] = 1.0;
}
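    // _d_columns holds the deformable im2col matrix: (in_channel * kernel_H * kernel_W) rows
    // by (out_H * out_W) columns; _d_ones is a vector of ones used in enqueue() to broadcast the bias.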
CHECK_CUDA(cudaMalloc((void**)&_d_columns, _in_channel * _kernel_H * _kernel_W * ones_size));
CHECK_CUDA(cudaMalloc((void**)&_d_ones, ones_size));
CHECK_CUDA(cudaMalloc((void**)&_d_weight, weight_size));
CHECK_CUDA(cudaMalloc((void**)&_d_bias, bias_size));
CHECK_CUDA(cudaMemcpy(_d_ones, ones_cpu, ones_size, cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(_d_weight, _h_weight.data(), weight_size, cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(_d_bias, _h_bias.data(), bias_size, cudaMemcpyHostToDevice));
delete[] ones_cpu;
_initialized = true;
return 0;
}
// terminate(): free the device memory allocated in initialize()
void DCNv2Plugin::terminate() {
if (!_initialized) {
return;
}
cudaFree(_d_columns);
cudaFree(_d_bias);
cudaFree(_d_weight);
cudaFree(_d_ones);
_initialized = false;
}
DCNv2Plugin::~DCNv2Plugin() {
terminate();
}
// Report which data types/formats the plugin supports (FP32 only here)
bool DCNv2Plugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const {
return (type == nvinfer1::DataType::kFLOAT);
}
// Compute the plugin's output dimensions from its input dimensions (the batch dimension is implicit and not included here)
nvinfer1::Dims DCNv2Plugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) {
assert(index == 0);
assert(inputDims);
assert(nbInputs == 3);
nvinfer1::Dims const& input = inputDims[0];
assert(is_CHW(input));
nvinfer1::Dims output;
output.nbDims = input.nbDims;
for( int d=0; d<input.nbDims; ++d ) {
output.type[d] = input.type[d];
output.d[d] = input.d[d];
}
output.d[0] = _out_channel;
output.d[1] = (output.d[1] + 2 * _padding - (_dilation * (_kernel_H - 1) + 1)) / _stride + 1;
output.d[2] = (output.d[2] + 2 * _padding - (_dilation * (_kernel_W - 1) + 1)) / _stride + 1;
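    // For the 3x3 DCN layers in CenterNet's upsampling head (stride 1, padding 1, dilation 1)
    // this works out to (H + 2*1 - 3)/1 + 1 = H, i.e. the spatial resolution is unchanged.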
return output;
}
// Report the scratch workspace size needed per invocation; this plugin allocates its own device buffers in initialize(), so it requests none
size_t DCNv2Plugin::getWorkspaceSize(int maxBatchSize) const {
return 0;
}
// enqueue(): executes the op defined by this plugin (called at inference time)
int DCNv2Plugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace,
cudaStream_t stream) {
float alpha ,beta;
int m, n, k;
cublasHandle_t handle = blas_handle();
const float* input = static_cast<const float *>(inputs[0]);
const float* offset = static_cast<const float *>(inputs[1]);
const float* mask = static_cast<const float *>(inputs[2]);
float * output = static_cast<float *>(outputs[0]);
nvinfer1::Dims input_dims = this->getInputDims(0);
assert(batchSize==1);
int h = input_dims.d[1];
int w = input_dims.d[2];
int height_out = (h + 2 * _padding - (_dilation * (_kernel_H - 1) + 1)) / _stride + 1;
int width_out = (w + 2 * _padding - (_dilation * (_kernel_W - 1) + 1)) / _stride + 1;
m = _out_channel;
n = height_out * width_out;
k = 1;
alpha = 1.0;
beta = 0.0;
/// output nxm
/// ones 1xn T ->> nx1
/// bias 1xm
/// ones x bias = nxm
// add bias
cublasSgemm(handle,
CUBLAS_OP_T, CUBLAS_OP_N,
n, m, k,&alpha,
_d_ones, k,
_d_bias, k,&beta,
output, n);
// im2col (offset and mask)
modulated_deformable_im2col_cuda(stream,input,offset,mask,
1, _in_channel, h, w,
height_out, width_out, _kernel_H, _kernel_W,
_padding, _padding, _stride, _stride, _dilation, _dilation,
_deformable_group, _d_columns);
m = _out_channel;
n = height_out * width_out;
k = _in_channel * _kernel_H * _kernel_W;
alpha = 1.0;
beta = 1.0;
// im2col conv
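// GEMM: output(m x n) += weight(m x k) * columns(k x n); beta = 1 keeps the bias
// already written into output by the first GEMM above.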
cublasSgemm(handle,
CUBLAS_OP_N, CUBLAS_OP_N,
n, m, k,&alpha,
_d_columns, n,
_d_weight, k,
&beta,
output, n);
return 0;
}
Next, the op implemented by the plugin above has to be registered with the ONNX importer, in onnx-tensorrt's builtin_op_importers.cpp:
DEFINE_BUILTIN_OP_IMPORTER(DCNv2) {
ASSERT(inputs.at(0).is_tensor(), ErrorCode::kUNSUPPORTED_NODE); // input
ASSERT(inputs.at(1).is_tensor(), ErrorCode::kUNSUPPORTED_NODE); // offset
ASSERT(inputs.at(2).is_tensor(), ErrorCode::kUNSUPPORTED_NODE); // mask
ASSERT(inputs.at(3).is_weights(), ErrorCode::kUNSUPPORTED_NODE); // weight
auto kernel_weights = inputs.at(3).weights();
nvinfer1::Weights bias_weights;
if( inputs.size() == 5 ) {
ASSERT(inputs.at(4).is_weights(), ErrorCode::kUNSUPPORTED_NODE);
auto shaped_bias_weights = inputs.at(4).weights();
ASSERT(shaped_bias_weights.shape.nbDims == 1, ErrorCode::kINVALID_NODE);
ASSERT(shaped_bias_weights.shape.d[0] == kernel_weights.shape.d[0], ErrorCode::kINVALID_NODE);
bias_weights = shaped_bias_weights;
} else {
bias_weights = ShapedWeights::empty(kernel_weights.type);
}
int out_channel,in_channel,kernel_H,kernel_W,deformable_group,dilation,groups,padding,stride;
out_channel = kernel_weights.shape.d[0];
in_channel = kernel_weights.shape.d[1];
kernel_H = kernel_weights.shape.d[2];
kernel_W = kernel_weights.shape.d[3];
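// Read the convolution hyper-parameters from the attributes of the ONNX DCNv2 node
// (defaulting to 1 when an attribute is missing).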
OnnxAttrs attrs(node);
deformable_group = attrs.get("deformable_group", 1);
dilation = attrs.get("dilation", 1);
groups = attrs.get("groups", 1);
padding = attrs.get("padding", 1);
stride = attrs.get("stride", 1);
RETURN_FIRST_OUTPUT(
ctx->addPlugin(
new DCNv2Plugin(in_channel,out_channel,kernel_H,kernel_W,deformable_group,
dilation,groups,padding,stride, kernel_weights, bias_weights),
{&inputs.at(0).tensor(),&inputs.at(1).tensor(),&inputs.at(2).tensor()}));
}
It also has to be registered in builtin_plugins.cpp:
REGISTER_BUILTIN_PLUGIN("DCNv2", DCNv2Plugin);
Then the source files that implement the plugin have to be added to onnx-tensorrt's CMakeLists.txt so that they are compiled and linked into the parser's plugin library:
# plugin source files
set(PLUGIN_SOURCES
FancyActivation.cu
ResizeNearest.cu
Split.cu
dcn_v2_im2col_cuda.cu
InstanceNormalization.cpp
DCNv2.cpp
plugin.cpp
)
# compile the plugin sources into the nvonnxparser_plugin library
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC --expt-extended-lambda")
if(${CMAKE_VERSION} VERSION_LESS ${CMAKE_VERSION_THRESHOLD})
CUDA_INCLUDE_DIRECTORIES(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR})
CUDA_ADD_LIBRARY(nvonnxparser_plugin STATIC ${PLUGIN_SOURCES})
else()
include_directories(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR})
add_library(nvonnxparser_plugin STATIC ${PLUGIN_SOURCES})
endif()
target_include_directories(nvonnxparser_plugin PUBLIC ${CUDA_INCLUDE_DIRS} ${ONNX_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
target_link_libraries(nvonnxparser_plugin ${TENSORRT_LIBRARY} cuda cudart cublas)
Finally, link against the libraries built above and use the standard TensorRT C++ API to perform the model conversion and run forward inference.
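To round the picture off, here is a minimal sketch (my own, not code from the repository) of how the generated parser/plugin library is typically driven through the TensorRT C++ API, using the implicit-batch, TensorRT 5/6-era interface that matches the plugin's enqueue() signature; the function name buildEngineFromOnnx and the logger gLogger are placeholders:

#include <fstream>
#include <iterator>
#include <vector>
#include <NvInfer.h>
#include <NvOnnxParser.h>

// gLogger: any nvinfer1::ILogger implementation supplied by the application.
extern nvinfer1::ILogger& gLogger;

// Parse an ONNX file (the DCNv2 nodes are handled by the importer/plugin registered above)
// and serialize a TensorRT engine to disk.
bool buildEngineFromOnnx(const char* onnxPath, const char* enginePath) {
    // read the ONNX model into memory
    std::ifstream in(onnxPath, std::ios::binary);
    std::vector<char> onnx((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());

    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
    nvinfer1::INetworkDefinition* network = builder->createNetwork();
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);
    if (!parser->parse(onnx.data(), onnx.size())) return false;

    builder->setMaxBatchSize(1);               // the DCNv2 plugin asserts batchSize == 1
    builder->setMaxWorkspaceSize(1ull << 30);  // scratch space for tactic selection
    nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);
    if (!engine) return false;

    // serialize the engine so it can be deserialized later for inference
    nvinfer1::IHostMemory* blob = engine->serialize();
    std::ofstream out(enginePath, std::ios::binary);
    out.write(static_cast<const char*>(blob->data()), blob->size());

    blob->destroy();
    engine->destroy();
    parser->destroy();
    network->destroy();
    builder->destroy();
    return true;
}

At inference time the serialized engine is then deserialized with createInferRuntime() / deserializeCudaEngine() and executed through IExecutionContext::enqueue().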