struck 利用結構化SVM來實作視覺跟蹤,在深度學習流行起來之前,struck是視覺跟蹤領域效果最好的方法。深度學習流行之後,利用泛化的卷積特徵能夠得到很好的效果。struck的優點在於,它可以使用任意的特徵來實作跟蹤,所以它可以利用卷積神經網絡提取的特徵,然後結合結構化SVM來實作視覺跟蹤,這樣的效果說不定更好。
struck的源碼是C++實作的,作者寫的很好,思路清晰,代碼結構清晰,而且與論文中的相符,沒有那麼多小trick,結果比較可靠。
下面從它的主函數開始,分析這份源碼是如何實作的:
main.cpp
/*
* Struck: Structured Output Tracking with Kernels
*
* Code to accompany the paper:
* Struck: Structured Output Tracking with Kernels
* Sam Hare, Amir Saffari, Philip H. S. Torr
* International Conference on Computer Vision (ICCV), 2011
*
* Copyright (C) 2011 Sam Hare, Oxford Brookes University, Oxford, UK
*
* This file is part of Struck.
*
* Struck is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Struck is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Struck. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "Tracker.h"
#include "Config.h"

#include <climits>
#include <cstdio>

#include <fstream>
#include <iostream>

#include <opencv/cv.h>
#include <opencv/highgui.h>
using namespace std;
using namespace cv;
static const int kLiveBoxWidth = 80;
static const int kLiveBoxHeight = 80;
// Draw a FloatRect onto rMat in the given colour.
// The float-valued box is first converted to integer pixel coordinates
// (via IntRect) and then forwarded to OpenCV's rectangle drawing routine.
void rectangle(Mat& rMat, const FloatRect& rRect, const Scalar& rColour)
{
    const IntRect ir(rRect);
    cv::rectangle(rMat, Point(ir.XMin(), ir.YMin()), Point(ir.XMax(), ir.YMax()), rColour);
}
int main(int argc, char* argv[])
{
//這幾句話沒啥作用,我給注釋掉
#ifndef WIN32
string programName = argv[0];
programName = programName.substr(programName.find_first_of('/'));
cout << "programName: " << programName << endl;
#endif
// read config file
string configPath = "../docs/config.txt";
Config conf(configPath);//作者定義的類Config 讀取了所有的配置資訊,并且cout輸出
cout << conf << endl;
if (conf.features.size() == 0)
{
cout << "error: no features specified in config" << endl;
return EXIT_FAILURE;
}
if (argc > 1)
{
conf.sequenceName = argv[1];
}
ofstream outFile;//定義一個輸出檔案流,輸出結果
if (conf.resultsPath != "")
{
#ifdef WIN32
string resultsPath = conf.resultsPath + "/" + conf.sequenceName + "_result.txt";
#else
string resultsPath = conf.resultsPath + "/" + conf.sequenceName + "_" + programName + "Result.txt";
#endif
outFile.open(resultsPath, ios::out);
if (!outFile)
{
cout << "error: could not open results file: " << conf.resultsPath << endl;
return EXIT_FAILURE;
}
}
// if no sequence specified then use the camera
bool useCamera = (conf.sequenceName == "");//根據在config.txt中是否給出視訊名稱,判斷是否使用攝像頭
VideoCapture cap;
int startFrame = -1;
int endFrame = -1;
FloatRect initBB;//這是一個模闆類,
string imgFormat;
float scaleW = 1.f;
float scaleH = 1.f;
if (useCamera)//使用攝像頭
{
if (!cap.open(0))
{
cout << "error: could not start camera capture" << endl;
return EXIT_FAILURE;
}
startFrame = 0;
endFrame = INT_MAX;
Mat tmp;
cap >> tmp;//讀入一幀視訊
scaleW = (float)conf.frameWidth/tmp.cols;//config中寬/讀入視訊的寬,比率
scaleH = (float)conf.frameHeight/tmp.rows;
/*該函數,創造了一個矩形,左上角在(120,80),80*80的矩形*/
initBB = IntRect(conf.frameWidth/2-kLiveBoxWidth/2, conf.frameHeight/2-kLiveBoxHeight/2, kLiveBoxWidth, kLiveBoxHeight);
cout << "press 'i' to initialise tracker" << endl;
}
else//使用視訊
{
// parse frames file
string framesFilePath = conf.sequenceBasePath+"/"+conf.sequenceName+"/"+"frames.txt";
ifstream framesFile(framesFilePath.c_str(), ios::in);
if (!framesFile)
{
cout << "error: could not open sequence frames file: " << framesFilePath << endl;
return EXIT_FAILURE;
}
string framesLine;
getline(framesFile, framesLine);
printf("%s", framesLine.c_str());
sscanf(framesLine.c_str(), "%d,%d", &startFrame, &endFrame);
if (framesFile.fail() || startFrame == -1 || endFrame == -1)
{
cout << "error: could not parse sequence frames file" << endl;
return EXIT_FAILURE;
}
imgFormat = conf.sequenceBasePath+"/"+conf.sequenceName+"/img/%04d.jpg";//qyy changed
// read first frame to get size
char imgPath[256];
sprintf(imgPath, imgFormat.c_str(), startFrame);
Mat tmp = cv::imread(imgPath, 0);
scaleW = (float)conf.frameWidth/tmp.cols;
scaleH = (float)conf.frameHeight/tmp.rows;
// read init box from ground truth file
string gtFilePath = conf.sequenceBasePath+"/"+conf.sequenceName+"/"+"groundtruth_rect.txt";//qyy changed
ifstream gtFile(gtFilePath.c_str(), ios::in);
if (!gtFile)
{
cout << "error: could not open sequence gt file: " << gtFilePath << endl;
return EXIT_FAILURE;
}
string gtLine;
getline(gtFile, gtLine);
float xmin = -1.f;
float ymin = -1.f;
float width = -1.f;
float height = -1.f;
sscanf(gtLine.c_str(), "%f,%f,%f,%f", &xmin, &ymin, &width, &height);
if (gtFile.fail() || xmin < 0.f || ymin < 0.f || width < 0.f || height < 0.f)
{
cout << "error: could not parse sequence gt file" << endl;
return EXIT_FAILURE;
}
initBB = FloatRect(xmin*scaleW, ymin*scaleH, width*scaleW, height*scaleH);
}
Tracker tracker(conf);//使用conf類,初始化Tracker類
if (!conf.quietMode)//quietMode模式下,不顯示結果,隻運算
{
namedWindow("result");
}
Mat result(conf.frameHeight, conf.frameWidth, CV_8UC3);
bool paused = false;
bool doInitialise = false;
srand(conf.seed);
for (int frameInd = startFrame; frameInd <= endFrame; ++frameInd)
{
cout << "frame num is: " << frameInd << endl;//qyy
Mat frame;
if (useCamera)
{
Mat frameOrig;
cap >> frameOrig;
resize(frameOrig, frame, Size(conf.frameWidth, conf.frameHeight));
//imshow("result",frame);//qyy
//waitKey(0);//qyy
flip(frame, frame, 1);//作者把視訊左右對稱翻轉了,不知道為什麼這麼做?
//imshow("result", frame);//qyy
//waitKey(0);//qyy
frame.copyTo(result);
if (doInitialise)
{
if (tracker.IsInitialised())
{
tracker.Reset();
}
else
{
tracker.Initialise(frame, initBB);
}
doInitialise = false;
}
else if (!tracker.IsInitialised())
{
rectangle(result, initBB, CV_RGB(255, 255, 255));//沒有初始化,就在result上畫白色框框
}
}
else
{
char imgPath[256];
sprintf(imgPath, imgFormat.c_str(), frameInd);
Mat frameOrig = cv::imread(imgPath, 0);//第二個參數flag指定讀取的顔色類型,=0表示讀取為灰階圖像
cout << "frameOrig.channels: " << frameOrig.channels() << endl;//qyy
if (frameOrig.empty())
{
cout << "error: could not read frame: " << imgPath << endl;
return EXIT_FAILURE;
}
resize(frameOrig, frame, Size(conf.frameWidth, conf.frameHeight));
cvtColor(frame, result, CV_GRAY2RGB);//作者讀進來的時候是灰階圖像,為了顯示轉換成3通道都是灰階圖
if (frameInd == startFrame)//如果是第一幀,初始化
{
tracker.Initialise(frame, initBB);
}
}
if (tracker.IsInitialised())//如果初始化了,就開始跟蹤
{
tracker.Track(frame);//跟蹤程式,把tracker當做一個類來對待,很清晰明了啊,贊一個;算法都在這裡面實作
if (!conf.quietMode && conf.debugMode)
{
tracker.Debug();//debug模式下,可以開啟很多額外的視窗顯示
}
rectangle(result, tracker.GetBB(), CV_RGB(0, 255, 0));//使用綠色框,畫出跟蹤的效果
if (outFile)//這裡是得到的矩形框,存儲到txt文本中
{
const FloatRect& bb = tracker.GetBB();
outFile << bb.XMin() / scaleW << "," << bb.YMin() / scaleH << "," << bb.Width() / scaleW << "," << bb.Height() / scaleH << flush << endl;
cout << "cout to file: " << bb.XMin() / scaleW << "," << bb.YMin() / scaleH << "," << bb.Width() / scaleW << "," << bb.Height() / scaleH << endl;
}
}
if (!conf.quietMode)//如果使用的是攝像頭,作者提供了幾個按鍵來選擇是否初始化,我用的是OTB資料集,就不管這個了
{
imshow("result", result);
int key = waitKey(paused ? 0 : 1);
if (key != -1)
{
if (key == 27 || key == 113) // esc q
{
break;
}
else if (key == 112) // p
{
paused = !paused;
}
else if (key == 105 && useCamera)//i
{
doInitialise = true;
cout << "initialised !" << endl;//qyy
}
}
if (conf.debugMode && frameInd == endFrame)
{
cout << "\n\nend of sequence, press any key to exit" << endl;
//waitKey();
}
}
}
if (outFile.is_open())
{
outFile.close();
}
return EXIT_SUCCESS;
}
所以,後面我主要關注tracker這個類做了什麼,我們看到在main.cpp中調用了tracker.Initialise、Debug、Track這幾個成員函數,所以這幾個函數是作者算法實作的關鍵。