手部關鍵點檢測是指在給定圖像中定位手指關節和指尖的過程。它類似於在臉部(面部關鍵點檢測)或身體(人體姿勢估計)上找到關鍵點。手部關鍵點檢測的不同之處在於,我們將整隻手視為一個對象。
美國卡內基梅隆大學感知計算實驗室(CMU Perceptual Computing Lab)發布了手部關鍵點檢測模型。詳情見:
我們将在本文介紹如何調用該模型。
1 背景
上圖出自上面說的論文
他們從一小組標記的手部圖像開始,並使用神經網絡(卷積姿態機,Convolutional Pose Machines)來粗略估計手部關鍵點。他們搭建了一個多視圖系統,可以從31個高清攝像頭擷取來自不同視點或角度的圖像。
他們將這些圖像輸入檢測器,以獲得許多粗略的關鍵點預測。一旦從不同視圖獲得同一隻手的關鍵點檢測結果,就會執行關鍵點三角測量以獲得關鍵點的3D位置。關鍵點的3D位置用於通過從3D到2D的重投影來穩健地預測關鍵點。這對於難以預測關鍵點的圖像尤其重要。通過這種方式,他們可以在幾次迭代中獲得更好的檢測器。
總之,他們使用關鍵點檢測器和多視圖圖像來提出改進的檢測器。改進的主要來源是标記的圖像集的多視圖圖像。
該模型産生22個關鍵點。手有21個關鍵點(0到20号關鍵點),而第22個關鍵點代表背景。關鍵點位置如下圖所示:
2 實作
從此連結下載該模型:
這是一個caffe模型。
模型讀取預測代碼和其他caffe模型一樣,如下所示:
// Excerpt: load the Caffe hand model, read the input image and run one
// forward pass. The complete program appears later in the article.

// Locations of the network definition and the trained weights.
const string protoFile = "./model/pose_deploy.prototxt";
const string weightsFile = "./model/pose_iter_102000.caffemodel";

// Read the input image.
const string imageFile = "./image/hand.jpg";
Mat frame = imread(imageFile);
if (frame.empty())
{
    // An unreadable image is a failure, so report it on stderr and exit non-zero.
    cerr << "check image" << endl;
    return 1;
}

// Working copy: keypoints are drawn on the copy, the skeleton on the original.
Mat frameCopy = frame.clone();

// Image dimensions, used to scale the probability maps back later.
const int frameWidth = frame.cols;
const int frameHeight = frame.rows;

// Minimum probability for a keypoint to be accepted.
const float thresh = 0.01f;

// Width/height ratio of the input image.
const float aspect_ratio = static_cast<float>(frameWidth) / static_cast<float>(frameHeight);

// Network input size: fixed height, width chosen to keep the aspect ratio.
// NOTE(review): the original computed (int(aspect_ratio*inHeight) * 8) / 8, which
// is a no-op in integer arithmetic; presumably rounding to a multiple of 8 was
// intended -- confirm before changing the value.
const int inHeight = 368;
const int inWidth = static_cast<int>(aspect_ratio * inHeight);
cout << "inWidth = " << inWidth << " ; inHeight = " << inHeight << endl;

// Start of the timed section.
double t = static_cast<double>(cv::getTickCount());

// Load the Caffe model.
Net net = readNetFromCaffe(protoFile, weightsFile);

// Build the input blob: scale by 1/255, resize to the network input size,
// zero mean, no R/B channel swap, no cropping.
Mat inpBlob = blobFromImage(frame, 1.0 / 255, Size(inWidth, inHeight), Scalar(0, 0, 0), false, false);
net.setInput(inpBlob);

// Forward pass; dimensions 2 and 3 of the output are read as the height and
// width of each probability map.
Mat output = net.forward();
const int H = output.size[2];
const int W = output.size[3];
輸出有22個矩陣,每個矩陣是一個關鍵點的機率圖。為了找到確切的關鍵點,首先將機率圖縮放到原始圖像的大小,然後通過查找機率圖的最大值來確定關鍵點的位置,這一步使用OpenCV中的minMaxLoc函數完成。我們在圖像上繪製檢測到的點及其編號,再利用這些點得到關鍵點構成的骨架並繪製在圖像上。定位關鍵點和繪製骨架的代碼如下:
// Excerpt: locate each keypoint in the network output and draw the skeleton.
// The complete program appears later in the article.

// One slot per probability map. (-1, -1) marks "not detected", so keypoints
// below the threshold are left out of the skeleton as well.
vector<Point> points(nPoints, Point(-1, -1));
for (int n = 0; n < nPoints; n++)
{
    // Probability map of the n-th channel, wrapping the network output in place.
    Mat probMap(H, W, CV_32F, output.ptr(0, n));

    // Scale the map up to the size of the original image.
    Mat scaledMap;
    resize(probMap, scaledMap, Size(frameWidth, frameHeight));

    // Highest probability in the map and where it occurs.
    Point maxLoc;
    double prob;
    minMaxLoc(scaledMap, nullptr, &prob, nullptr, &maxLoc);

    if (prob > thresh)
    {
        // Mark the keypoint and label it with its index.
        circle(frameCopy, maxLoc, 8, Scalar(0, 255, 255), -1);
        cv::putText(frameCopy, cv::format("%d", n), maxLoc, cv::FONT_HERSHEY_COMPLEX, 1, cv::Scalar(0, 0, 255), 2);

        // Keep only confident detections for the skeleton.
        points[n] = maxLoc;
    }
}

// Number of skeleton segments to draw.
const int nPairs = sizeof(POSE_PAIRS) / sizeof(POSE_PAIRS[0]);

// Connect the detected keypoints to draw the skeleton.
for (int n = 0; n < nPairs; n++)
{
    // The two keypoints joined by this segment.
    const Point partA = points[POSE_PAIRS[n][0]];
    const Point partB = points[POSE_PAIRS[n][1]];

    // Skip the segment when either end was not detected.
    if (partA.x < 0 || partA.y < 0 || partB.x < 0 || partB.y < 0)
        continue;

    // Draw the segment and its two end points.
    line(frame, partA, partB, Scalar(0, 255, 255), 8);
    circle(frame, partA, 8, Scalar(0, 0, 255), -1);
    circle(frame, partB, 8, Scalar(0, 0, 255), -1);
}
結果如下:
3 結果和代碼
需要注意的一點是,檢測器需要手周圍的邊界框來預測關鍵點。因此,為了獲得更好的效果,手應靠近相機,並清晰地位於畫面中央。目前這類深度學習方法的精度還不高,只能在特定場合下使用:先確定關鍵點,再訓練模型,基於統計進行檢測。
代碼見:
C++代碼:
// HandPoints_detection.cpp : 此檔案包含 "main" 函數。程式執行将在此處開始并結束。
//
#include "pch.h"
#include <iostream>
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
using namespace cv::dnn;
// Keypoint index pairs joined by a skeleton segment; for example {0, 1}
// connects keypoint 0 to keypoint 1, the first segment of the thumb.
// Each row below lists the four segments of one finger.
const int POSE_PAIRS[20][2] =
{
    // thumb
    { 0,  1}, { 1,  2}, { 2,  3}, { 3,  4},
    // index finger
    { 0,  5}, { 5,  6}, { 6,  7}, { 7,  8},
    // middle finger
    { 0,  9}, { 9, 10}, {10, 11}, {11, 12},
    // ring finger
    { 0, 13}, {13, 14}, {14, 15}, {15, 16},
    // small finger
    { 0, 17}, {17, 18}, {18, 19}, {19, 20}
};

// Number of probability maps read from the network output. The accompanying
// article describes these as 21 hand keypoints (0-20) plus one background map.
int nPoints = 22;
int main()
{
//模型檔案位置
string protoFile = "./model/pose_deploy.prototxt";
string weightsFile = "./model/pose_iter_102000.caffemodel";
// read image 讀取圖像
string imageFile = "./image/hand.jpg";
Mat frame = imread(imageFile);
if (frame.empty())
{
cout << "check image" << endl;
return 0;
}
//複制圖像
Mat frameCopy = frame.clone();
//讀取圖像長寬
int frameWidth = frame.cols;
int frameHeight = frame.rows;
float thresh = 0.01;
//原圖寬高比
float aspect_ratio = frameWidth / (float)frameHeight;
int inHeight = 368;
//縮放圖像
int inWidth = (int(aspect_ratio*inHeight) * 8) / 8;
cout << "inWidth = " << inWidth << " ; inHeight = " << inHeight << endl;
double t = (double)cv::getTickCount();
//調用caffe模型
Net net = readNetFromCaffe(protoFile, weightsFile);
Mat inpBlob = blobFromImage(frame, 1.0 / 255, Size(inWidth, inHeight), Scalar(0, 0, 0), false, false);
net.setInput(inpBlob);
Mat output = net.forward();
int H = output.size[2];
int W = output.size[3];
// find the position of the body parts 找到各點的位置
vector<Point> points(nPoints);
for (int n = 0; n < nPoints; n++)
{
// Probability map of corresponding body's part. 第一個特征點的預測矩陣
Mat probMap(H, W, CV_32F, output.ptr(0, n));
//放大預測矩陣
resize(probMap, probMap, Size(frameWidth, frameHeight));
Point maxLoc;
double prob;
//尋找預測矩陣,最大值機率以及最大值的坐标位置
minMaxLoc(probMap, 0, &prob, 0, &maxLoc);
if (prob > thresh)
{
//畫圖
circle(frameCopy, cv::Point((int)maxLoc.x, (int)maxLoc.y), 8, Scalar(0, 255, 255), -1);
cv::putText(frameCopy, cv::format("%d", n), cv::Point((int)maxLoc.x, (int)maxLoc.y), cv::FONT_HERSHEY_COMPLEX, 1, cv::Scalar(0, 0, 255), 2);
}
//儲存特征點的坐标
points[n] = maxLoc;
}
//擷取要畫的骨架線個數
int nPairs = sizeof(POSE_PAIRS) / sizeof(POSE_PAIRS[0]);
//連接配接點,畫骨架
for (int n = 0; n < nPairs; n++)
{
// lookup 2 connected body/hand parts
Point2f partA = points[POSE_PAIRS[n][0]];
Point2f partB = points[POSE_PAIRS[n][1]];
if (partA.x <= 0 || partA.y <= 0 || partB.x <= 0 || partB.y <= 0)
continue;
//畫骨條線
line(frame, partA, partB, Scalar(0, 255, 255), 8);
circle(frame, partA, 8, Scalar(0, 0, 255), -1);
circle(frame, partB, 8, Scalar(0, 0, 255), -1);
}
//計算運作時間
t = ((double)cv::getTickCount() - t) / cv::getTickFrequency();
cout << "Time Taken = " << t << endl;
imshow("Output-Keypoints", frameCopy);
imshow("Output-Skeleton", frame);
imwrite("Output-Skeleton.jpg", frame);
waitKey();
return 0;
}
from __future__ import division
import cv2
import time
import numpy as np
# Hand keypoint demo: run the Caffe hand pose model on one image, mark the
# detected keypoints on a copy of the image and draw the skeleton on the original.

# Locations of the network definition and the trained weights.
protoFile = "./model/pose_deploy.prototxt"
weightsFile = "./model/pose_iter_102000.caffemodel"

# Number of probability maps read from the network output.
nPoints = 22

# Keypoint index pairs joined by a skeleton segment, four segments per row.
POSE_PAIRS = [
    [0, 1], [1, 2], [2, 3], [3, 4],
    [0, 5], [5, 6], [6, 7], [7, 8],
    [0, 9], [9, 10], [10, 11], [11, 12],
    [0, 13], [13, 14], [14, 15], [15, 16],
    [0, 17], [17, 18], [18, 19], [19, 20],
]

net = cv2.dnn.readNetFromCaffe(protoFile, weightsFile)

frame = cv2.imread("./image/hand.jpg")
if frame is None:
    # cv2.imread returns None when the image cannot be read; stop with a clear
    # message instead of failing later on frame.shape.
    raise SystemExit("check image")

# Keypoints are drawn on the copy, the skeleton on the original.
frameCopy = np.copy(frame)
frameWidth = frame.shape[1]
frameHeight = frame.shape[0]
aspect_ratio = frameWidth/frameHeight

# Minimum probability for a keypoint to be accepted.
threshold = 0.1

t = time.time()

# Input image dimensions for the network: fixed height, width chosen to keep
# the aspect ratio.
# NOTE(review): the original computed int(((aspect_ratio*inHeight)*8)//8), which
# only truncates to an integer; presumably rounding to a multiple of 8 was
# intended -- confirm before changing the value.
inHeight = 368
inWidth = int(aspect_ratio * inHeight)

# Scale by 1/255, resize to the network input size, zero mean, no R/B swap, no crop.
inpBlob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (inWidth, inHeight), (0, 0, 0), swapRB=False, crop=False)
net.setInput(inpBlob)
output = net.forward()
print("time taken by network : {:.3f}".format(time.time() - t))

# Detected keypoints, one entry per probability map; None when below the threshold.
points = []
for i in range(nPoints):
    # Probability map of the i-th channel, scaled up to the original image size.
    probMap = output[0, i, :, :]
    probMap = cv2.resize(probMap, (frameWidth, frameHeight))

    # Find the global maximum of the map and its location.
    minVal, prob, minLoc, point = cv2.minMaxLoc(probMap)

    if prob > threshold:
        location = (int(point[0]), int(point[1]))
        cv2.circle(frameCopy, location, 8, (0, 255, 255), thickness=-1, lineType=cv2.FILLED)
        cv2.putText(frameCopy, "{}".format(i), location, cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, lineType=cv2.LINE_AA)
        # Keep the point only when its probability exceeds the threshold.
        points.append(location)
    else:
        points.append(None)

# Draw the skeleton, skipping segments with an undetected end point.
for pair in POSE_PAIRS:
    partA = pair[0]
    partB = pair[1]

    if points[partA] and points[partB]:
        cv2.line(frame, points[partA], points[partB], (0, 255, 255), 2)
        cv2.circle(frame, points[partA], 8, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)
        cv2.circle(frame, points[partB], 8, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)

# Show and save both results, then wait for a key press.
cv2.imshow('Output-Keypoints', frameCopy)
cv2.imshow('Output-Skeleton', frame)

cv2.imwrite('Output-Keypoints.jpg', frameCopy)
cv2.imwrite('Output-Skeleton.jpg', frame)

print("Total time taken : {:.3f}".format(time.time() - t))

cv2.waitKey(0)