DeepStream + TensorRT: End-to-End Human Pose Estimation with OpenPose
1. DeepStream Overview
For a detailed introduction to DeepStream, see:
https://cloud.tencent.com/developer/article/1457105
2. TensorRT Overview
For a detailed introduction to TensorRT, see:
https://www.cnblogs.com/gezhuangzhuang/p/11661924.html
3. OpenPose Overview
For a detailed introduction to OpenPose, see:
https://github.com/CMU-Perceptual-Computing-Lab/openpose
Sample output:
![](https://img.laitimes.com/img/_0nNw4CM6IyYiwiM6ICdiwiIyVGduV2YfNWawNiZpdmL4IDNzUjN1ETMyIzMwAjMwIzLc52YucWbp5GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.gif)
4. Goal: Accelerate OpenPose with TensorRT and Run It End to End in DeepStream
4.1 Accelerating OpenPose with TensorRT
A TensorRT-accelerated implementation of OpenPose is available on GitHub; you can run its demo to check the results.
**GitHub:** https://github.com/zerollzeng/tensorrt-zoo
4.2 Loading the OpenPose Model into DeepStream
DeepStream does not ship with an OpenPose implementation, so we have to provide one through a custom plugin. Two problems need to be solved: (1) converting the model to a TensorRT engine and saving it, and (2) parsing the inference output tensors with a custom function validated by CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE.
4.2.1 Converting the model to an engine and saving it
Saving the engine produced by https://github.com/zerollzeng/tensorrt-zoo is enough. If you want the engine to be built at startup instead, DeepStream can fetch it through the NvDsInferCudaEngineGet interface, which requires a small adapter:
Full code of nvdsinfer_openpose_engine.cpp:
```cpp
/*
 * @Description: build the OpenPose engine for DeepStream's nvinfer plugin
 * @Author: zerollzeng
 * @Date: 2019-08-21 09:45:10
 * @LastEditTime: 2019-12-04 19:27:27
 * @LastEditors: zerollzeng
 */
#include "OpenPose.hpp"
#include "opencv2/opencv.hpp"
#include "nvdsinfer_custom_impl.h"
#include "nvdsinfer_context.h"

#include <iostream>
#include <string>
#include <vector>

/* Called by nvinfer to obtain a CUDA engine: build the OpenPose engine
 * (serializing it to saveEngine along the way) and hand it to DeepStream. */
extern "C"
bool NvDsInferCudaEngineGet(nvinfer1::IBuilder *builder,
                            NvDsInferContextInitParams *initParams,
                            nvinfer1::DataType dataType,
                            nvinfer1::ICudaEngine *&cudaEngine)
{
    printf("GO IN OPENPOSE\n");
    const std::string prototxt   = "/home/ubuntu/deepstream_sdk_v4.0.2_x86_64/samples/models/body_25/pose_deploy.prototxt";
    const std::string caffemodel = "/home/ubuntu/deepstream_sdk_v4.0.2_x86_64/samples/models/body_25/pose_iter_584000.caffemodel";
    const std::string saveEngine = "/home/ubuntu/deepstream_sdk_v4.0.2_x86_64/samples/models/body_25/pose.engine";
    const int maxBatchSize = 1;
    const int run_mode = 1;
    std::vector<std::string> outputBlobname{"net_output"};
    std::vector<std::vector<float>> calibratorData;  // empty: INT8 calibration is not used

    OpenPose openpose1;
    cudaEngine = openpose1.createEngine1(prototxt,
                                         caffemodel,
                                         saveEngine,
                                         outputBlobname,
                                         calibratorData,
                                         maxBatchSize,
                                         run_mode);
    if (cudaEngine == nullptr)
    {
        std::cerr << "Failed to build CUDA engine" << std::endl;
        return false;
    }
    printf("GO IN OPENPOSE create engine success\n");
    return true;
}
```
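Inside tensorrt-zoo, the saveEngine path is where Trt::CreateEngine writes the serialized engine. For reference, serializing a built engine by hand with the standard TensorRT API looks roughly like the sketch below; the function name and error handling are my own, not part of the original sources.

```cpp
#include <fstream>
#include <string>
#include <NvInfer.h>

// Minimal sketch: serialize a built ICudaEngine to disk so later runs can
// deserialize it instead of rebuilding from the Caffe model.
bool writeEngineToFile(nvinfer1::ICudaEngine* engine, const std::string& path)
{
    nvinfer1::IHostMemory* blob = engine->serialize();  // serialized engine bytes
    if (blob == nullptr) return false;
    std::ofstream file(path, std::ios::binary);
    file.write(static_cast<const char*>(blob->data()), blob->size());
    blob->destroy();  // ownership API of the TensorRT 5/6/7 era
    return file.good();
}
```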
Full code of OpenPose.hpp:
```cpp
/*
 * @Description: openpose tensorrt
 * @Author: zerollzeng
 * @Date: 2019-08-19 11:38:23
 * @LastEditTime: 2019-10-16 15:52:21
 * @LastEditors: zerollzeng
 * @Version: 1.0
 */
#ifndef OPENPOSE_HPP
#define OPENPOSE_HPP

#include <string>
#include <vector>

#include <NvInfer.h>
#include <NvCaffeParser.h>

class Trt;

class OpenPose {
public:
    OpenPose();
    ~OpenPose();

    /**
     * @prototxt: NOTE: set input height and width in the prototxt.
     * @calibratorData: pass an empty instance; INT8 is not supported yet.
     * @maxBatchSize: set to 1.
     */
    nvinfer1::ICudaEngine* createEngine1(const std::string& prototxt,
                                         const std::string& caffeModel,
                                         const std::string& saveEngine,
                                         const std::vector<std::string>& outputBlobName,
                                         const std::vector<std::vector<float>>& calibratorData,
                                         int maxBatchSize,
                                         int runMode);

    /**
     * @inputData: 1 x 3 x 480 x 640, or any size you like, as long as the
     *             prototxt is modified to match.
     * @result: output keypoints, laid out as
     *          (x1,y1,score1, x2,y2,score2, ..., x25,y25,score25) per person.
     */
    void DoInference(std::vector<float>& inputData, std::vector<float>& result);

private:
    void MallocExtraMemory();

    Trt* mNet;
    int mBatchSize;

    // input device memory
    void* mpInputGpu;
    // input size in bytes
    int64_t mInputSize;
    // input data type
    nvinfer1::DataType mInputDataType;
    // input dims
    nvinfer1::Dims3 mInputDims;

    void* mpHeatMapGpu;
    float* mpHeatMapCpu;
    int64_t mHeatMapSize;
    nvinfer1::Dims3 mHeatMapDims;

    const float mResizeScale = 4;  // heat map upsampling factor
    void* mpResizeMapGpu;
    float* mpResizeMapCpu;
    int64_t mResizeMapSize;
    nvinfer1::Dims3 mResizeMapDims;

    void* mpKernelGpu;
    int* mpKernelCpu;
    int64_t mKernelSize;
    nvinfer1::Dims3 mKernelDims;

    void* mpPeaksGpu;
    float* mpPeaksCpu;
    int64_t mPeaksSize;
    const int mNumPeaks = 25;
    int mMaxPerson = 128;
    const int mPeaksVector = 3;
    nvinfer1::Dims3 mPeaksDims;

    // NMS parameters
    const float mThreshold = 0.05f;
    const float mNMSoffset = 0.5f;

    // body-part connection parameters
    float mInterMinAboveThreshold = 0.95f;
    float mInterThreshold = 0.05f;
    int mMinSubsetCnt = 3;
    float mMinSubsetScore = 0.4f;
    float mScaleFactor = 8.f;
};
#endif
```
Excerpt from OpenPose.cu:
```cpp
OpenPose::OpenPose() {}

nvinfer1::ICudaEngine* OpenPose::createEngine1(const std::string& prototxt,
                                               const std::string& caffeModel,
                                               const std::string& saveEngine,
                                               const std::vector<std::string>& outputBlobName,
                                               const std::vector<std::vector<float>>& calibratorData,
                                               int maxBatchSize,
                                               int runMode) {
    mNet = new Trt();
    // Build (or deserialize) the engine via tiny-tensorrt, then allocate the
    // post-processing buffers.
    nvinfer1::ICudaEngine* cudaEngine = mNet->CreateEngine(prototxt, caffeModel, saveEngine,
                                                           outputBlobName, calibratorData,
                                                           maxBatchSize, runMode);
    MallocExtraMemory();
    return cudaEngine;
}
```
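Outside of DeepStream, the class can also be driven directly, which is handy for checking the engine in isolation. A rough usage sketch follows; the [0, 1] pixel scaling and the file names here are my assumptions, not taken from the original demo:

```cpp
#include <iostream>
#include <string>
#include <vector>
#include <opencv2/opencv.hpp>
#include "OpenPose.hpp"

int main()
{
    // Build (or load) the engine, then run one image through it.
    OpenPose pose;
    pose.createEngine1("pose_deploy.prototxt", "pose_iter_584000.caffemodel",
                       "pose.engine", {"net_output"}, {},
                       /*maxBatchSize=*/1, /*runMode=*/1);

    // Resize to the input size configured in the prototxt (1 x 3 x 480 x 640)
    // and flatten to CHW floats. Scaling to [0, 1] is an assumption here;
    // match whatever preprocessing the model was trained with.
    cv::Mat img = cv::imread("test.jpg");
    cv::resize(img, img, cv::Size(640, 480));
    std::vector<float> input(3 * 480 * 640);
    for (int c = 0; c < 3; ++c)
        for (int y = 0; y < 480; ++y)
            for (int x = 0; x < 640; ++x)
                input[(c * 480 + y) * 640 + x] = img.at<cv::Vec3b>(y, x)[c] / 255.f;

    // Keypoints come back as (x, y, score) * 25 per detected person.
    std::vector<float> result;
    pose.DoInference(input, result);
    std::cout << "persons detected: " << result.size() / 75 << std::endl;
    return 0;
}
```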
4.2.2 Parsing the inference output tensors via CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE
On my machine the parser lives at deepstream_sdk_v4.0.2_x86_64/sources/libs/nvdsinfer_customparser/nvdsinfer_custombboxparser.cu. The parameters below are for the BODY_25 model.
```cpp
/*
 * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include <array>
#include <cstring>
#include <iostream>
#include <stdexcept>
#include "nvdsinfer_custom_impl.h"
#include "NvInfer.h"
#include "cuda.cuh"
#include "cuda.hpp"
#include "ResizeAndMerge.hpp"
#include "PoseNMS.hpp"
#include "BodyPartConnector.hpp"
#include "Point.hpp"

#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define CLIP(a,min,max) (MAX(MIN(a, max), min))
#define DIVIDE_AND_ROUND_UP(a, b) ((a + b - 1) / b)

/* Custom parser for the OpenPose BODY_25 output tensor. The function keeps
 * the NvDsInferParseCustomResnet name of the SDK sample it was adapted from,
 * so the sample configs keep working. */

#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr) \
    { \
        cudaError_t error_code = callstr; \
        if (error_code != cudaSuccess) { \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
            exit(1); \
        } \
    }
#endif

inline unsigned int getElementSize(nvinfer1::DataType t)
{
    switch (t)
    {
        case nvinfer1::DataType::kINT32: return 4;
        case nvinfer1::DataType::kFLOAT: return 4;
        case nvinfer1::DataType::kHALF: return 2;
        case nvinfer1::DataType::kINT8: return 1;
        default: throw std::runtime_error("Invalid DataType.");
    }
}

inline void* safeCudaMalloc(size_t memSize)
{
    void* deviceMem;
    CUDA_CHECK(cudaMalloc(&deviceMem, memSize));
    if (deviceMem == nullptr) {
        std::cerr << "Out of memory" << std::endl;
        exit(1);
    }
    return deviceMem;
}

/* C-linkage to prevent name-mangling */
extern "C"
bool NvDsInferParseCustomResnet (std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
                                 NvDsInferNetworkInfo const &networkInfo,
                                 NvDsInferParseDetectionParams const &detectionParams,
                                 std::vector<NvDsInferObjectDetectionInfo> &objectList);

extern "C"
bool NvDsInferParseCustomResnet (std::vector<NvDsInferLayerInfo> const &outputLayersInfo,
                                 NvDsInferNetworkInfo const &networkInfo,
                                 NvDsInferParseDetectionParams const &detectionParams,
                                 std::vector<NvDsInferObjectDetectionInfo> &objectList)
{
    printf("OK IN pgie_pad_buffer_probe\n");
    int mBatchSize = 1;
    const float mResizeScale = 4;
    const int mNumPeaks = 25;
    int mMaxPerson = 128;
    const int mPeaksVector = 3;
    std::vector<float> result;
    // heat map buffer: 78 channels x 60 x 80 floats = 1497600 bytes
    void* mpHeatMapGpu = safeCudaMalloc(78 * 60 * 80 * sizeof(float));

    // NMS parameters
    const float mThreshold = 0.05f;
    const float mNMSoffset = 0.5f;
    // body-part connection parameters
    float mInterMinAboveThreshold = 0.95f;
    float mInterThreshold = 0.05f;
    int mMinSubsetCnt = 3;
    float mMinSubsetScore = 0.4f;
    float mScaleFactor = 8.f;

    // allocate the resized heat map
    nvinfer1::Dims3 mResizeMapDims = nvinfer1::Dims3(78, int(60 * mResizeScale), int(80 * mResizeScale));
    int64_t mResizeMapSize = mBatchSize * mResizeMapDims.d[0] * mResizeMapDims.d[1] * mResizeMapDims.d[2] * sizeof(float);
    void* mpResizeMapGpu = safeCudaMalloc(mResizeMapSize);
    float* mpResizeMapCpu = new float[mResizeMapSize / sizeof(float)];

    // allocate the NMS kernel buffer
    int64_t mKernelSize = mBatchSize * 78 * 240 * 320 * sizeof(int);
    void* mpKernelGpu = safeCudaMalloc(mKernelSize);
    int* mpKernelCpu = new int[mBatchSize * 78 * 240 * 320];

    // allocate the peaks buffer
    nvinfer1::Dims3 mPeaksDims = nvinfer1::Dims3(mNumPeaks, mMaxPerson, mPeaksVector);
    int64_t mPeaksSize = mPeaksDims.d[0] * mPeaksDims.d[1] * mPeaksDims.d[2] * sizeof(float);
    void* mpPeaksGpu = safeCudaMalloc(mPeaksSize);
    float* mpPeaksCpu = new float[mPeaksDims.d[0] * mPeaksDims.d[1] * mPeaksDims.d[2]];

    /* Find the output layer */
    static int outputLayerIndex = -1;
    if (outputLayerIndex == -1) {
        for (unsigned int i = 0; i < outputLayersInfo.size(); i++) {
            if (strcmp(outputLayersInfo[i].layerName, "net_output") == 0) {
                outputLayerIndex = i;
                break;
            }
        }
        if (outputLayerIndex == -1) {
            std::cerr << "Could not find net_output layer buffer while parsing" << std::endl;
            return false;
        }
    }

    // upload the network output (host buffer) into the GPU heat map buffer
    CUDA_CHECK(cudaMemcpy(mpHeatMapGpu, outputLayersInfo[outputLayerIndex].buffer,
                          78 * 60 * 80 * sizeof(float), cudaMemcpyHostToDevice));

    // resize the heat maps 4x (60 x 80 -> 240 x 320)
    int widthSource = 80;
    int heightSource = 60;
    int widthTarget = 320;
    int heightTarget = 240;
    const dim3 threadsPerBlock{ 16, 16, 1 };
    const dim3 numBlocks{
        op::getNumberCudaBlocks(widthTarget, threadsPerBlock.x),
        op::getNumberCudaBlocks(heightTarget, threadsPerBlock.y),
        op::getNumberCudaBlocks(78, threadsPerBlock.z) };
    op::resizeKernel<<<numBlocks, threadsPerBlock>>>((float*)mpResizeMapGpu, (float*)mpHeatMapGpu,
                                                     widthSource, heightSource, widthTarget, heightTarget);
    CUDA_CHECK(cudaMemcpy(mpResizeMapCpu, mpResizeMapGpu, mResizeMapSize, cudaMemcpyDeviceToHost));

    // pose NMS: find keypoint peaks in the resized heat maps
    std::array<int, 4> targetSize2{ mBatchSize, mNumPeaks, mMaxPerson, mPeaksVector };
    std::array<int, 4> sourceSize2{ mBatchSize, 78, 240, 320 };
    op::Point<float> offset = op::Point<float>(0.5, 0.5);
    op::nmsGpu((float*)mpPeaksGpu, (int*)mpKernelGpu, (float*)mpResizeMapGpu, mThreshold,
               targetSize2, sourceSize2, offset);
    CUDA_CHECK(cudaMemcpyAsync(mpPeaksCpu, mpPeaksGpu, mPeaksSize, cudaMemcpyDeviceToHost, 0));

    // connect body parts into per-person skeletons
    op::Array<float> poseKeypoints;
    op::Array<float> poseScores;
    op::Point<int> resizeMapSize = op::Point<int>(320, 240);
    op::connectBodyPartsCpu(poseKeypoints, poseScores, mpResizeMapCpu, mpPeaksCpu,
                            op::PoseModel::BODY_25, resizeMapSize, mMaxPerson,
                            mInterMinAboveThreshold, mInterThreshold,
                            mMinSubsetCnt, mMinSubsetScore, 1.f);

    // scale keypoint coordinates back to network input resolution;
    // every third element is a confidence score and is left untouched
    result.resize(poseKeypoints.getVolume());
    // std::cout << "number of person: " << poseKeypoints.getVolume() / 75 << std::endl;
    for (int i = 0; i < poseKeypoints.getVolume(); i++) {
        if ((i + 1) % 3 == 0) {
            result[i] = poseKeypoints[i];
        } else {
            result[i] = poseKeypoints[i] * (8 / mResizeScale);
        }
    }
    printf("OK OUT pgie_pad_buffer_probe\n");
    return true;
}

/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomResnet);
```
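The parser leaves the final keypoints in result, laid out as (x, y, score) * 25 per person at network input resolution. As a quick way to sanity-check that layout, a hypothetical OpenCV overlay helper (not part of the original sources) could look like this:

```cpp
#include <opencv2/opencv.hpp>
#include <vector>

// Hypothetical helper: overlay BODY_25 keypoints parsed above onto a frame.
// Assumes `result` holds (x, y, score) * 25 per person at network input
// resolution (640 x 480 here).
void drawKeypoints(cv::Mat& frame, const std::vector<float>& result, float minScore = 0.05f)
{
    const int numPersons = static_cast<int>(result.size()) / 75;  // 25 keypoints * 3 floats
    for (int p = 0; p < numPersons; ++p) {
        for (int k = 0; k < 25; ++k) {
            const float x = result[p * 75 + k * 3 + 0];
            const float y = result[p * 75 + k * 3 + 1];
            const float s = result[p * 75 + k * 3 + 2];
            if (s > minScore)
                cv::circle(frame, cv::Point2f(x, y), 3, cv::Scalar(0, 255, 0), -1);
        }
    }
}
```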
Include directories: the parser depends on the OpenPose post-processing headers (ResizeAndMerge.hpp, PoseNMS.hpp, BodyPartConnector.hpp, Point.hpp), so their locations must be added to the include path.
dstensor_pgie_config.txt needs the following configuration:
```ini
################################################################################
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
################################################################################
# Following properties are mandatory when engine files are not specified:
# int8-calib-file(Only in INT8)
# Caffemodel mandatory properties: model-file, proto-file, output-blob-names
# UFF: uff-file, input-dims, uff-input-blob-name, output-blob-names
# ONNX: onnx-file
#
# Mandatory properties for detectors:
# num-detected-classes
#
# Optional properties for detectors:
# enable-dbscan(Default=false), interval(Primary mode only, Default=0)
# custom-lib-path,
# parse-bbox-func-name
#
# Mandatory properties for classifiers:
# classifier-threshold, network-type (Default=0 i.e. Detector)
#
# Optional properties for classifiers:
# classifier-async-mode(Secondary mode only, Default=false)
#
# Optional properties in secondary mode:
# operate-on-gie-id(Default=0), operate-on-class-ids(Defaults to all classes),
# input-object-min-width, input-object-min-height, input-object-max-width,
# input-object-max-height
#
# Following properties are always recommended:
# batch-size(Default=1)
#
# Other optional properties:
# net-scale-factor(Default=1), network-mode(Default=0 i.e FP32),
# model-color-format(Default=0 i.e. RGB) model-engine-file, labelfile-path,
# mean-file, gie-unique-id(Default=0), offsets, gie-mode (Default=1 i.e. primary),
# custom-lib-path, network-mode(Default=0 i.e FP32)
#
# The values in the config file are overridden by values set through GObject
# properties.
[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
model-file=../../../../samples/models/body_25/pose_iter_584000.caffemodel
proto-file=../../../../samples/models/body_25/pose_deploy.prototxt
model-engine-file=../../../../samples/models/body_25/pose3.engine
batch-size=1
network-mode=1
process-mode=1
model-color-format=0
num-detected-classes=4
interval=0
gie-unique-id=1
output-blob-names=net_output
## 0=Detector, 1=Classifier, 2=Segmentation, 100=Other
network-type=100
parse-bbox-func-name=NvDsInferParseCustomResnet
custom-lib-path=/home/ubuntu/objectDetector_OpenPose/lib/libtestopenpose.so
#custom-lib-path=/home/ubuntu/zhai/tensorrt-zoo/lib/libtinytrt.so
#Enable tensor metadata output
output-tensor-meta=1
[class-attrs-all]
threshold=0.2
eps=0.2
group-threshold=1
```
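A few notes on the key properties: net-scale-factor=0.0039215697906911373 is 1/255, so input pixels are scaled to [0, 1]; network-type=100 ("Other") combined with output-tensor-meta=1 tells nvinfer to attach the raw output tensors to the batch metadata instead of running its built-in detector or classifier parsing; model-engine-file points at the serialized engine so the Caffe model does not have to be rebuilt on every run; and num-detected-classes appears to be a leftover from the detector sample, since no bounding boxes are produced here.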