天天看点

在android开发板上测试neon加速实验

今天为了测试在mtk6757上面的neon加速性能,从网上找来一个测试例子保留了它的汇编文件和cpp文件,做了一些简单的修改,然后自己写了一个Android.mk和Application.mk

采用ndk来进行编译。

文件提取链接  http://pan.baidu.com/s/1o8KF8tO

下面贴出相关文件的结构和代码

[email protected]:~$ tree work >tree.log

[email protected]:~$ cat tree.log 

work

└── jni

    ├── Android.mk

    ├── Application.mk

    ├── cost_cal.s

    ├── cost_final.s

    ├── cost_init.s

    ├── l_cost.s

    ├── sgbm.cpp

    ├── test.cpp

    └── x_cost.s

1 directory, 9 files

[email protected]:~$

[email protected]:~/work/jni$ cat Android.mk

LOCAL_PATH:= $(call my-dir)  

include $(CLEAR_VARS)  

NDK_APP_DST_DIR := $(LOCAL_PATH)

include /home/archermind/OpenCV-android-sdk/sdk/native/jni/OpenCV.mk 

LOCAL_SRC_FILES := cost_cal.s  cost_final.s  cost_init.s  l_cost.s  x_cost.s test.cpp sgbm.cpp

LOCAL_CFLAGS := -D__cpusplus -g -mfloat-abi=soft -mfpu=neon -march=armv7-a -mtune=cortex-a53

TARGET_ARCH_ABI :=armeabi-v7a 

LOCAL_ARM_MODE := arm

ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)

LOCAL_ARM_NEON := true 

endif   

LOCAL_MODULE := test

include $(BUILD_EXECUTABLE)

[email protected]:~/work/jni$ 

[email protected]:~/work/jni$ cat Application.mk

APP_STL := gnustl_static

APP_CPPFLAGS := -frtti -fexceptions -std=c++11 

APP_ABI := armeabi-v7a

APP_PLATFORM := android-19

[email protected]:~/work/jni$ 

[email protected]:~/work/jni$ cat cost_cal.s

.arm

.text

.global cost_cal

cost_cal:

push {r4-r6}

ldr r12,[sp,#12]@hsumSub

ldr r4,[sp,#16]@C

ldr r5,[sp,#20]@Count

ldr r6,[sp,#24]@hsumAdd[x+d]

lsr r5,r5,#3@Count/8

.loop:

vld1.16 {q0},[r0]! @pixAdd[d]

vld1.16 {q1},[r1]! @pixSub[d]

vld1.16 {q2},[r2]! @hsumAdd[x-D+d]

vqadd.s16 q3,q0,q2

vqsub.s16 q3,q3,q1@hsumAdd[x+d]

vst1.16 {q3},[r6]! @hsumAdd[x+d]

vld1.16 {q8},[r3]! @Cprev[x+d]

vld1.16 {q9},[r12]! @hsumSub[x+d]

vqadd.s16 q3,q3,q8

vqsub.s16 q3,q3,q9

vst1.16 {q3},[r4]!

subs r5,r5,#1

bne .loop

pop           {r4-r6}

bx            lr

.end

[email protected]:~/work/jni$ 

[email protected]:~/work/jni$ cat cost_final.s

.arm

.text

.global cost_final

cost_final:

vdup.8 q0,r0 @u

lsr r3,r3,#4@D/16

.loop:

vld1.8 {q1},[r1]! @v

vabd.s8 q2,q0,q1 @abs(u-v)

vmovl.s8 q3,d4

vmovl.s8 q8,d5

vld1.16 {q1},[r2]

vqadd.s16 q1,q1,q3

vst1.16 {q1},[r2]!

vld1.16 {q2},[r2]

vqadd.s16 q2,q2,q8

vst1.16 {q2},[r2]!

subs r3,r3,#1

bne .loop

bx            lr

.end

[email protected]:~/work/jni$ cat cost_init.s

.arm

.text

.global cost_init

cost_init:

push {r4-r7}

ldr r12,[sp,#16]@v

ldr r4,[sp,#20]@v0

ldr r5,[sp,#24]@v1

ldr r6,[sp,#28]@cost

ldr r7,[sp,#32]@D

lsr r7,r7,#4 

vdup.8 q0,r0 @u

vdup.8 q1,r1 @u0

vdup.8 q2,r2 @u1

vdup.16 q12,r3 @(-1)*diff_scale

.loop:

vld1.8 {q3},[r12]! @v

vld1.8 {q8},[r4]! @v0

vld1.8 {q9},[r5]! @v1

vqsub.s8 q9,q0,q9 @u-v1

vqsub.s8 q8,q8,q0 @v0-u

vmax.s8 q8,q8,q9 @c0=max(u-v1,v0-u)

vqsub.s8 q10,q3,q2 @v-u1

vqsub.s8 q11,q1,q3 @u0-v

vmax.s8 q10,q10,q11 @c1=max(v-u1,u0-v)

vmin.s8 q3,q8,q10 @min(c0,c1)

vmovl.s8 q10,d6

vmovl.s8 q11,d7

vshl.s16 q10,q10,q12

vshl.s16 q11,q11,q12

vld1.16 {q3},[r6]

vqadd.s16 q10,q10,q3

vst1.16 {q10},[r6]!

vld1.16 {q3},[r6]

vqadd.s16 q11,q11,q3

vst1.16 {q11},[r6]!

subs r7,r7,#1

bne .loop

pop           {r4-r7}

bx            lr

.end

[email protected]:~/work/jni$ cat l_cost.s

    .arm

.text

.global l_cost

l_cost:

        push     {r4-r9}

ldr r12,[sp,#24]  @lr_p0

ldr     r4,[sp,#28]   @lr_p1

ldr     r5,[sp,#32]   @Cp

ldr     r6,[sp,#36]   @Sp

ldr     r7,[sp,#40]   @minLr[0][xm]

ldr r8,[sp,#44] @D

vpush           {q4}

vdup.16       q0,r0       @delta0

vdup.16       q1,r1      @delta1

vdup.16       q2,r2      @delta2

vdup.16       q3,r3      @delta3

mov           r0,#200     

vdup.16       q4,r0       @p1

mov         r0,#255

lsl    r0,r0,#7

add         r0,r0,#127

vdup.16 q8,r0 @minL0

vdup.16 q9,r0 @minL1

vdup.16 q10,r0 @minL2

vdup.16 q11,r0 @minL3

add r1,r8,#16@D2

mov r9,#22

mul r9,r9,r1

sub r9,r9,#16@22*D2-16

lsl r2,r1,#3@NRD2

sub r3,r2,#1

lsl r3,r3,#1@2*(NRD2-1)

lsl r1,r1,#1@2*D2

add r0,r1,r3@2*(NRD2-1+D2)

lsl r2,r0,#1@4*(NRD2-1+D2)

sub r2,r2,#10@4*NRD2+4*D2-14

lsr r8,r8,#3@D/8

.loop:

vld1.16       {q12},[r5]!    @Cp[d]

vld1.16 {q13},[r6] @Sp[d] 

vld1.16   {q14},[r12]   @lr_p0[d]

sub          r12,r12,#2     

vld1.16      {q15},[r12]   @lr_p0[d-1]

vqadd.s16     q15,q15,q4   @lr_p0[d-1]+p1

vmin.s16 q14,q14,q15 @min(lr_p0[d],lr_p0[d-1]+p1)

add          r12,r12,#4    

vld1.16      {q15},[r12]  @lr_p0[d+1]

vqadd.s16 q15,q15,q4 @lr_p0[d+1]+p1

vmin.s16 q14,q14,q15 @min(lr_p0[d],lr_p0[d-1]+p1,lr_p0[d+1],p1)

vmin.s16      q14,q14,q0

vqsub.s16     q14,q14,q0

vqadd.s16     q14,q14,q12  @L0

add r12,r12,r3@Lr_p[d]

vst1.16 {q14},[r12] @Lr_p[d]

vmin.s16 q8,q8,q14 @min(minL0,L0)

vqadd.s16 q13,q13,q14 @Sp[d]+L0

vld1.16       {q14},[r4] @lr_p1[d]

sub           r4,r4,#2  

vld1.16       {q15},[r4]  @lr_p1[d-1]

vqadd.s16  q15,q15,q4 @lr_p1[d-1]+p1

vmin.s16 q14,q14,q15 @min(lr_p1[d],lr_p1[d-1]+p1)

add           r4,r4,#4

vld1.16       {q15},[r4]  @lr_p1[d+1]

vqadd.s16     q15,q15,q4   @lr_p1[d+1]+p1

vmin.s16      q14,q14,q15 @min(lr_p1[d],lr_p1[d-1]+p1,lr_p1[d+1]+p1)

vmin.s16      q14,q14,q1 

vqsub.s16     q14,q14,q1

vqadd.s16     q14,q14,q12 @L1

add r12,r12,r1

vst1.16 {q14},[r12] @Lr_p[d+D2]

vmin.s16 q9,q9,q14 @min(minL1,L1)

vqadd.s16 q13,q13,q14 @Sp[d]+L0+L1

add r4,r4,r0  

vld1.16       {q14},[r4] @lr_p2[d]

sub           r4,r4,#2

vld1.16       {q15},[r4]  @lr_p2[d-1]

vqadd.s16 q15,q15,q4 @lr_p2[d-1]+p1

vmin.s16 q14,q14,q15 @min(lr_p2[d],lr_p2[d-1]+p1)

add           r4,r4,#4

vld1.16       {q15},[r4]  @lr_p2[d+1]

vqadd.s16     q15,q15,q4   @lr_p2[d+1]+p1

vmin.s16      q14,q14,q15  @min(lr_p2[d],lr_p2[d-1]+p1,lr_p2[d+1]+p1)

vmin.s16      q14,q14,q2

vqsub.s16     q14,q14,q2

vqadd.s16     q14,q14,q12 @L2

add r12,r12,r1

vst1.s16 {q14},[r12] @Lr_p[d+2*D2]

vmin.s16 q10,q10,q14 @min(minL2,L2)

vqadd.s16 q13,q13,q14 @Sp[d]+L0+L1+L2

add r4,r4,r0 

vld1.16       {q14},[r4] @lr_p3[d]

sub           r4,r4,#2

vld1.16       {q15},[r4]  @lr_p3[d-1]

vqadd.s16 q15,q15,q4 @lr_p3[d-1]+p1

vmin.s16 q14,q14,q15 @min(lr_p3[d],lr_p3[d-1]+p1)

add           r4,r4,#4

vld1.16       {q15},[r4]  @lr_p3[d+1]

vqadd.s16     q15,q15,q4   @lr_p3[d+1]+p1

vmin.s16      q14,q14,q15  @min(lr_p3[d],lr_p3[d-1]+p1,lr_p3[d+1]+p1)

vmin.s16      q14,q14,q3

vqsub.s16     q14,q14,q3

vqadd.s16     q14,q14,q12 @L3

add r12,r12,r1

vst1.16 {q14},[r12] @Lr_p[d+3*D2]

vmin.s16 q11,q11,q14 @min(minL3,L3)

vqadd.s16 q13,q13,q14 @Sp[d]+L0+L1+L2+L3

vst1.16 {q13},[r6]! @Sp[d]

sub r12,r12,r9@Lr_p[d+3*D2]->Lr_p0[d+8]

sub r4,r4,r2@Lr_p3[d+1]->Lr_p1[d+8]

subs r8,r8,#1

bne .loop

vtrn.16       q8,q9        @L0,L1

vtrn.16       q10,q11        @L2,L3

vmin.s16      q8,q8,q9     

vmin.s16      q10,q10,q11

vmin.s16      d0,d16,d17

vmin.s16      d1,d20,d21

vshr.s64      q1,q0,#32

vmin.s16 q0,q0,q1

vtrn.32 d0,d1 @minL0,minL1,minL2,minL3

vst1.16 {q0},[r7]

vpop           {q4}

pop           {r4-r9}

bx            lr

.end

[email protected]:~/work/jni$ 

[email protected]:~/work/jni$ cat sgbm.cpp

#include "opencv2/core/core.hpp"

#include "opencv2/highgui/highgui.hpp"

#include "opencv2/imgproc/imgproc.hpp"

#include <iostream>

using namespace cv;

using namespace std;

typedef uchar PixType;

typedef short CostType;

typedef short DispType;

typedef cv::Point_<short> Point2s;

enum { NR = 16, NR2 = NR / 2 };

extern "C" void x_cost(short a,short* b,short* c,short* d,short*e,short*f,int g,short*h); 

extern "C" void cost_cal(short* a,short* b,short* c,const short* d,const short* e,short *f,int g,short* h);

extern "C" void l_cost(short a,short b,short c,short d,short* e,short* f,const short* g,short* h,short* i,int j);

extern "C" void cost_init(int a,int b,int c,int d,PixType *e,PixType*f,PixType*g,short*h,int i);

extern "C" void cost_final(int a,PixType * b,short* c,int d);

struct StereoSGBMParams

{

StereoSGBMParams()

{

minDisparity = numDisparities = 0;

SADWindowSize = 0;

P1 = P2 = 0;

disp12MaxDiff = 0;

preFilterCap = 0;

uniquenessRatio = 0;

speckleWindowSize = 0;

speckleRange = 0;

mode = 0;

}

StereoSGBMParams(int _minDisparity, int _numDisparities, int _SADWindowSize,

int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,

int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,

int _mode)

{

minDisparity = _minDisparity;

numDisparities = _numDisparities;

SADWindowSize = _SADWindowSize;

P1 = _P1;

P2 = _P2;

disp12MaxDiff = _disp12MaxDiff;

preFilterCap = _preFilterCap;

uniquenessRatio = _uniquenessRatio;

speckleWindowSize = _speckleWindowSize;

speckleRange = _speckleRange;

mode = _mode;

}

int minDisparity;

int numDisparities;

int SADWindowSize;

int preFilterCap;

int uniquenessRatio;

int P1;

int P2;

int speckleWindowSize;

int speckleRange;

int disp12MaxDiff;

int mode;

};

template <typename T>

void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)

{

using namespace cv;

int width = img.cols, height = img.rows, npixels = width*height;

size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));

if (!_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize)

_buf.create(1, (int)bufSize, CV_8U);

uchar* buf = _buf.ptr();

int i, j, dstep = (int)(img.step / sizeof(T));

int* labels = (int*)buf;

buf += npixels*sizeof(labels[0]);

Point2s* wbuf = (Point2s*)buf;

buf += npixels*sizeof(wbuf[0]);

uchar* rtype = (uchar*)buf;

int curlabel = 0;

// clear out label assignments

memset(labels, 0, npixels*sizeof(labels[0]));

for (i = 0; i < height; i++)

{

T* ds = img.ptr<T>(i);

int* ls = labels + width*i;

for (j = 0; j < width; j++)

{

if (ds[j] != newVal)   // not a bad disparity

{

if (ls[j])     // has a label, check for bad label

{

if (rtype[ls[j]]) // small region, zero out disparity

ds[j] = (T)newVal;

}

// no label, assign and propagate

else

{

Point2s* ws = wbuf; // initialize wavefront

Point2s p((short)j, (short)i);  // current pixel

curlabel++; // next label

int count = 0;  // current region size

ls[j] = curlabel;

// wavefront propagation

while (ws >= wbuf) // wavefront not empty

{

count++;

// put neighbors onto wavefront

T* dpp = &img.at<T>(p.y, p.x);

T dp = *dpp;

int* lpp = labels + width*p.y + p.x;

if (p.y < height - 1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff)

{

lpp[+width] = curlabel;

*ws++ = Point2s(p.x, p.y + 1);

}

if (p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff)

{

lpp[-width] = curlabel;

*ws++ = Point2s(p.x, p.y - 1);

}

if (p.x < width - 1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff)

{

lpp[+1] = curlabel;

*ws++ = Point2s(p.x + 1, p.y);

}

if (p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff)

{

lpp[-1] = curlabel;

*ws++ = Point2s(p.x - 1, p.y);

}

// pop most recent and propagate

// NB: could try least recent, maybe better convergence

p = *--ws;

}

// assign label type

if (count <= maxSpeckleSize)   // speckle region

{

rtype[ls[j]] = 1;   // small region label

ds[j] = (T)newVal;

}

else

rtype[ls[j]] = 0;   // large region label

}

}

}

}

}

void filterSpeckles(InputOutputArray _img, double _newval, int maxSpeckleSize,

double _maxDiff, InputOutputArray __buf)

{

Mat img = _img.getMat();

int type = img.type();

Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;

CV_Assert(type == CV_8UC1 || type == CV_16SC1);

int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);

#if IPP_VERSION_X100 >= 801

CV_IPP_CHECK()

{

Ipp32s bufsize = 0;

IppiSize roisize = { img.cols, img.rows };

IppDataType datatype = type == CV_8UC1 ? ipp8u : ipp16s;

if (!__buf.needed() && (type == CV_8UC1 || type == CV_16SC1))

{

IppStatus status = ippiMarkSpecklesGetBufferSize(roisize, datatype, CV_MAT_CN(type), &bufsize);

Ipp8u * buffer = ippsMalloc_8u(bufsize);

if ((int)status >= 0)

{

if (type == CV_8UC1)

status = ippiMarkSpeckles_8u_C1IR(img.ptr<Ipp8u>(), (int)img.step, roisize,

(Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, buffer);

else

status = ippiMarkSpeckles_16s_C1IR(img.ptr<Ipp16s>(), (int)img.step, roisize,

(Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, buffer);

}

if (status >= 0)

{

CV_IMPL_ADD(CV_IMPL_IPP);

return;

}

setIppErrorStatus();

}

}

#endif

if (type == CV_8UC1)

filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);

else

filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);

}

static void calcPixelCostBT(const Mat& img1, const Mat& img2, int y,

int minD, int maxD, CostType* cost,

PixType* buffer, const PixType* tab,

int tabOfs, int)

{

int x, c, width = img1.cols, cn = img1.channels();

int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);

int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);

int D = maxD - minD, width1 = maxX1 - minX1, width2 = maxX2 - minX2;

const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);

PixType *prow1 = buffer + width2 * 2, *prow2 = prow1 + width*cn * 2;

tab += tabOfs;

for (c = 0; c < cn * 2; c++)

{

prow1[width*c] = prow1[width*c + width - 1] =

prow2[width*c] = prow2[width*c + width - 1] = tab[0];

}

int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows - 1 ? (int)img1.step : 0;

int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows - 1 ? (int)img2.step : 0;

if (cn == 1)

{

for (x = 1; x < width - 1; x++)

{

prow1[x] = tab[(row1[x + 1] - row1[x - 1]) * 2 + row1[x + n1 + 1] - row1[x + n1 - 1] + row1[x + s1 + 1] - row1[x + s1 - 1]];

prow2[width - 1 - x] = tab[(row2[x + 1] - row2[x - 1]) * 2 + row2[x + n2 + 1] - row2[x + n2 - 1] + row2[x + s2 + 1] - row2[x + s2 - 1]];

prow1[x + width] = row1[x];

prow2[width - 1 - x + width] = row2[x];

}

}

else

{

for (x = 1; x < width - 1; x++)

{

prow1[x] = tab[(row1[x * 3 + 3] - row1[x * 3 - 3]) * 2 + row1[x * 3 + n1 + 3] - row1[x * 3 + n1 - 3] + row1[x * 3 + s1 + 3] - row1[x * 3 + s1 - 3]];

prow1[x + width] = tab[(row1[x * 3 + 4] - row1[x * 3 - 2]) * 2 + row1[x * 3 + n1 + 4] - row1[x * 3 + n1 - 2] + row1[x * 3 + s1 + 4] - row1[x * 3 + s1 - 2]];

prow1[x + width * 2] = tab[(row1[x * 3 + 5] - row1[x * 3 - 1]) * 2 + row1[x * 3 + n1 + 5] - row1[x * 3 + n1 - 1] + row1[x * 3 + s1 + 5] - row1[x * 3 + s1 - 1]];

prow2[width - 1 - x] = tab[(row2[x * 3 + 3] - row2[x * 3 - 3]) * 2 + row2[x * 3 + n2 + 3] - row2[x * 3 + n2 - 3] + row2[x * 3 + s2 + 3] - row2[x * 3 + s2 - 3]];

prow2[width - 1 - x + width] = tab[(row2[x * 3 + 4] - row2[x * 3 - 2]) * 2 + row2[x * 3 + n2 + 4] - row2[x * 3 + n2 - 2] + row2[x * 3 + s2 + 4] - row2[x * 3 + s2 - 2]];

prow2[width - 1 - x + width * 2] = tab[(row2[x * 3 + 5] - row2[x * 3 - 1]) * 2 + row2[x * 3 + n2 + 5] - row2[x * 3 + n2 - 1] + row2[x * 3 + s2 + 5] - row2[x * 3 + s2 - 1]];

prow1[x + width * 3] = row1[x * 3];

prow1[x + width * 4] = row1[x * 3 + 1];

prow1[x + width * 5] = row1[x * 3 + 2];

prow2[width - 1 - x + width * 3] = row2[x * 3];

prow2[width - 1 - x + width * 4] = row2[x * 3 + 1];

prow2[width - 1 - x + width * 5] = row2[x * 3 + 2];

}

}

memset(cost, 0, width1*D*sizeof(cost[0]));

buffer -= minX2;

cost -= minX1*D + minD; // simplify the cost indices inside the loop

#if 1

for (c = 0; c < cn * 2; c++, prow1 += width, prow2 += width)

{

int diff_scale = c < cn ? 0 : 2;

// precompute

//   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and

//   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and

for (x = minX2; x < maxX2; x++)

{

int v = prow2[x];

int vl = x > 0 ? (v + prow2[x - 1]) / 2 : v;

int vr = x < width - 1 ? (v + prow2[x + 1]) / 2 : v;

int v0 = std::min(vl, vr); v0 = std::min(v0, v);

int v1 = std::max(vl, vr); v1 = std::max(v1, v);

buffer[x] = (PixType)v0;

buffer[x + width2] = (PixType)v1;

}

for (x = minX1; x < maxX1; x++)

{

int u = prow1[x];

int ul = x > 0 ? (u + prow1[x - 1]) / 2 : u;

int ur = x < width - 1 ? (u + prow1[x + 1]) / 2 : u;

int u0 = std::min(ul, ur); u0 = std::min(u0, u);

int u1 = std::max(ul, ur); u1 = std::max(u1, u);

//assembly 

cost_init(u,u0,u1,-diff_scale,prow2+width - x - 1 + minD,buffer+width - x - 1 + minD,buffer+width - x - 1 + minD+width2,cost+x*D+minD,D);

}

}

#else

for (c = 0; c < cn * 2; c++, prow1 += width, prow2 += width)

{

for (x = minX1; x < maxX1; x++)

{

int u = prow1[x];

//assembly

cost_final(u,prow2+width - 1 - x +minD ,cost+x*D+minD,D);

}

}

#endif

}

static void computeDisparitySGBM(const Mat& img1, const Mat& img2,

Mat& disp1, const StereoSGBMParams& params,

Mat& buffer)

{

const int ALIGN = 16;

const int DISP_SHIFT = 4;

const int DISP_SCALE = (1 << DISP_SHIFT);

const CostType MAX_COST = SHRT_MAX;

int minD = params.minDisparity, maxD = minD + params.numDisparities;

Size SADWindowSize;

SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;

int ftzero = std::max(params.preFilterCap, 15) | 1;

int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;

int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;

int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1 + 1);

int k, width = disp1.cols, height = disp1.rows;

int minX1 = std::max(-maxD, 0), maxX1 = width + std::min(minD, 0);

int D = maxD - minD, width1 = maxX1 - minX1;

int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;

int SW2 = SADWindowSize.width / 2, SH2 = SADWindowSize.height / 2;

bool fullDP = params.mode == 1;

int npasses = fullDP ? 2 : 1;

const int TAB_OFS = 256 * 4, TAB_SIZE = 256 + TAB_OFS * 2;

PixType clipTab[TAB_SIZE];

short *minvalue=new short[12]; //for the use of x_cost.s

memset(minvalue,0,sizeof(short)*12);

short d8[8]={0,1,2,3,4,5,6,7};  

for (k = 0; k < TAB_SIZE; k++)

clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);

if (minX1 >= maxX1)

{

disp1 = Scalar::all(INVALID_DISP_SCALED);

return;

}

CV_Assert(D % 16 == 0);

// NR - the number of directions. the loop on x below that computes Lr assumes that NR == 8.

// if you change NR, please, modify the loop as well.

int D2 = D + 16, NRD2 = NR2*D2;

// the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:

// for 8-way dynamic programming we need the current row and

// the previous row, i.e. 2 rows in total

const int NLR = 2;

const int LrBorder = NLR - 1;

// for each possible stereo match (img1(x,y) <=> img2(x-d,y))

// we keep pixel difference cost (C) and the summary cost over NR directions (S).

// we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)

size_t costBufSize = width1*D;

size_t CSBufSize = costBufSize*(fullDP ? height : 1);

size_t minLrSize = (width1 + LrBorder * 2)*NR2, LrSize = minLrSize*D2;

int hsumBufNRows = SH2 * 2 + 2;

size_t totalBufSize = (LrSize + minLrSize)*NLR*sizeof(CostType) + // minLr[] and Lr[]

costBufSize*(hsumBufNRows + 1)*sizeof(CostType) + // hsumBuf, pixdiff

CSBufSize * 2 * sizeof(CostType) + // C, S

width * 16 * img1.channels()*sizeof(PixType) + // temp buffer for computing per-pixel cost

width*(sizeof(CostType) + sizeof(DispType)) + 1024; // disp2cost + disp2

if (buffer.empty() || !buffer.isContinuous() ||

buffer.cols*buffer.rows*buffer.elemSize() < totalBufSize)

buffer.create(1, (int)totalBufSize, CV_8U);

// summary cost over different (nDirs) directions

CostType* Cbuf = (CostType*)alignPtr(buffer.ptr(), ALIGN);

CostType* Sbuf = Cbuf + CSBufSize;

CostType* hsumBuf = Sbuf + CSBufSize;

CostType* pixDiff = hsumBuf + costBufSize*hsumBufNRows;

CostType* disp2cost = pixDiff + costBufSize + (LrSize + minLrSize)*NLR;

DispType* disp2ptr = (DispType*)(disp2cost + width);

PixType* tempBuf = (PixType*)(disp2ptr + width);

// add P2 to every C(x,y). it saves a few operations in the inner loops

for (k = 0; k < width1*D; k++)

Cbuf[k] = (CostType)P2;

for (int pass = 1; pass <= npasses; pass++)

{

int x1, y1, x2, y2, dx, dy;

if (pass == 1)

{

y1 = 0; y2 = height; dy = 1;

x1 = 0; x2 = width1; dx = 1;

}

else

{

y1 = height - 1; y2 = -1; dy = -1;

x1 = width1 - 1; x2 = -1; dx = -1;

}

CostType *Lr[NLR] = { 0 }, *minLr[NLR] = { 0 };

for (k = 0; k < NLR; k++)

{

// shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,

// and will occasionally use negative indices with the arrays

// we need to shift Lr[k] pointers by 1, to give the space for d=-1.

// however, then the alignment will be imperfect, i.e. bad for SSE,

// thus we shift the pointers by 8 (8*sizeof(short) == 16 - ideal alignment)

Lr[k] = pixDiff + costBufSize + LrSize*k + NRD2*LrBorder + 8;

memset(Lr[k] - LrBorder*NRD2 - 8, 0, LrSize*sizeof(CostType));

minLr[k] = pixDiff + costBufSize + LrSize*NLR + minLrSize*k + NR2*LrBorder;

memset(minLr[k] - LrBorder*NR2, 0, minLrSize*sizeof(CostType));

}

for (int y = y1; y != y2; y += dy)

{

int x, d;

DispType* disp1ptr = disp1.ptr<DispType>(y);

CostType* C = Cbuf + (!fullDP ? 0 : y*costBufSize);

CostType* S = Sbuf + (!fullDP ? 0 : y*costBufSize);

if (pass == 1) // compute C on the first pass, and reuse it on the second pass, if any.

{

int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;

for (k = dy1; k <= dy2; k++)

{

CostType* hsumAdd = hsumBuf + (std::min(k, height - 1) % hsumBufNRows)*costBufSize;

if (k < height)

{

calcPixelCostBT(img1, img2, k, minD, maxD, pixDiff, tempBuf, clipTab, TAB_OFS, ftzero);

memset(hsumAdd, 0, D*sizeof(CostType));

for (x = 0; x <= SW2*D; x += D)

{

int scale = x == 0 ? SW2 + 1 : 1;

for (d = 0; d < D; d++)

hsumAdd[d] = (CostType)(hsumAdd[d] + pixDiff[x + d] * scale);

}

if (y > 0)

{

const CostType* hsumSub = hsumBuf + (std::max(y - SH2 - 1, 0) % hsumBufNRows)*costBufSize;

const CostType* Cprev = !fullDP || y == 0 ? C : C - costBufSize;

for (x=D;x<(SW2+1)*D;x+=D)

{

const CostType* pixAdd = pixDiff + x + SW2*D;

const CostType* pixSub = pixDiff ;  

for( d = 0; d < D; d++ )

{

int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);

C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);

}

}

//assembly

cost_cal(pixDiff + (2*SW2+1)*D,pixDiff ,hsumAdd+SW2*D,Cprev+(SW2+1)*D,hsumSub+(SW2+1)*D,C+(SW2+1)*D,(width1-1-2*SW2)*D,hsumAdd+(SW2+1)*D);

for(x=(width1-SW2)*D;x<width1*D;x+=D)

{

const CostType* pixAdd = pixDiff + (width1-1)*D;

const CostType* pixSub = pixDiff + x - (SW2+1)*D;

for( d = 0; d < D; d++ )

{

int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);

C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);

}

}

}

else

{

for (x = D; x < width1*D; x += D)

{

const CostType* pixAdd = pixDiff + std::min(x + SW2*D, (width1 - 1)*D);

const CostType* pixSub = pixDiff + std::max(x - (SW2 + 1)*D, 0);

for (d = 0; d < D; d++)

hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);

}

}

}

if (y == 0)

{

int scale = k == 0 ? SH2 + 1 : 1;

for (x = 0; x < width1*D; x++)

C[x] = (CostType)(C[x] + hsumAdd[x] * scale);

}

}

// also, clear the S buffer

for (k = 0; k < width1*D; k++)

S[k] = 0;

}

// clear the left and the right borders

memset(Lr[0] - NRD2*LrBorder - 8, 0, NRD2*LrBorder*sizeof(CostType));

memset(Lr[0] + width1*NRD2 - 8, 0, NRD2*LrBorder*sizeof(CostType));

memset(minLr[0] - NR2*LrBorder, 0, NR2*LrBorder*sizeof(CostType));

memset(minLr[0] + width1*NR2, 0, NR2*LrBorder*sizeof(CostType));

for (x = x1; x != x2; x += dx)

{

int xm = x*NR2, xd = xm*D2;

int delta0 = minLr[0][xm - dx*NR2] + P2, delta1 = minLr[1][xm - NR2 + 1] + P2;

int delta2 = minLr[1][xm + 2] + P2, delta3 = minLr[1][xm + NR2 + 3] + P2;

CostType* Lr_p0 = Lr[0] + xd - dx*NRD2;

CostType* Lr_p1 = Lr[1] + xd - NRD2 + D2;

CostType* Lr_p2 = Lr[1] + xd + D2 * 2;

CostType* Lr_p3 = Lr[1] + xd + NRD2 + D2 * 3;

Lr_p0[-1] = Lr_p0[D] = Lr_p1[-1] = Lr_p1[D] =

Lr_p2[-1] = Lr_p2[D] = Lr_p3[-1] = Lr_p3[D] = MAX_COST;

//CostType* Lr_p = Lr[0] + xd;

const CostType* Cp = C + x*D;

CostType* Sp = S + x*D;

//assembly

l_cost((short)delta0,(short)delta1,(short)delta2,(short)delta3,Lr_p0,Lr_p1,Cp,Sp,minLr[0]+xm,D);

}

if (pass == npasses)

{

for (x = 0; x < width; x++)

{

disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;

disp2cost[x] = MAX_COST;

}

for (x = width1 - 1; x >= 0; x--)

{

CostType* Sp = S + x*D;

int minS = MAX_COST, bestDisp = -1;

if (npasses == 1)

{

int xm = x*NR2, xd = xm*D2;

//int minL0 = MAX_COST;

int delta0 = minLr[0][xm + NR2] + P2;

CostType* Lr_p0 = Lr[0] + xd + NRD2;

Lr_p0[-1] = Lr_p0[D] = MAX_COST;

//CostType* Lr_p = Lr[0] + xd;

CostType* Cp = C + x*D;

//assembly

x_cost((short)delta0,Cp,Lr_p0,Lr[0]+xd,Sp,d8,D,minvalue);

minLr[0][xm]=minvalue[0];

minS=minvalue[1];

int i=4;

while(minvalue[i]==0&&i<11)

{

i++;

}

bestDisp=minvalue[i];

}

else

{

for (d = 0; d < D; d++)

{

int Sval = Sp[d];

if (Sval < minS)

{

minS = Sval;

bestDisp = d;

}

}

}

for (d = 0; d < D; d++)

{

if (Sp[d] * (100 - uniquenessRatio) < minS * 100 && std::abs(bestDisp - d) > 1)

break;

}

if (d < D)

continue;

d = bestDisp;

int _x2 = x + minX1 - d - minD;

if (disp2cost[_x2] > minS)

{

disp2cost[_x2] = (CostType)minS;

disp2ptr[_x2] = (DispType)(d + minD);

}

if (0 < d && d < D - 1)

{

// do subpixel quadratic interpolation:

//   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])

//   then find minimum of the parabola.

int denom2 = std::max(Sp[d - 1] + Sp[d + 1] - 2 * Sp[d], 1);

d = d*DISP_SCALE + ((Sp[d - 1] - Sp[d + 1])*DISP_SCALE + denom2) / (denom2 * 2);

}

else

d *= DISP_SCALE;

disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);

}

for (x = minX1; x < maxX1; x++)

{

// we round the computed disparity both towards -inf and +inf and check

// if either of the corresponding disparities in disp2 is consistent.

// This is to give the computed disparity a chance to look valid if it is.

int d1 = disp1ptr[x];

if (d1 == INVALID_DISP_SCALED)

continue;

int _d = d1 >> DISP_SHIFT;

int d_ = (d1 + DISP_SCALE - 1) >> DISP_SHIFT;

int _x = x - _d, x_ = x - d_;

if (0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&

0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff)

disp1ptr[x] = (DispType)INVALID_DISP_SCALED;

}

}

// now shift the cyclic buffers

std::swap(Lr[0], Lr[1]);

std::swap(minLr[0], minLr[1]);

}

}

}

void compute(InputArray leftarr, InputArray rightarr, OutputArray disparr,int mindis,int numdis)

{

Mat left = leftarr.getMat(), right = rightarr.getMat();

CV_Assert(left.size() == right.size() && left.type() == right.type() &&

left.depth() == CV_8U);

disparr.create(left.size(), CV_16S);

Mat disp = disparr.getMat();

StereoSGBMParams params(mindis, numdis, 5, 200, 800, 10, 4, 1, 150, 2, 0);

Mat buffer;

computeDisparitySGBM(left, right, disp, params, buffer);

medianBlur(disp, disp, 3);

if (params.speckleWindowSize > 0)

filterSpeckles(disp, (params.minDisparity - 1) * 16, params.speckleWindowSize,

16 * params.speckleRange, buffer);

}

struct Data{

Mat img1,img2,disp;

int mindis,numdis;

};

void*do_compute(void*arg)

{ Data* data=(Data*)arg;

compute(data->img1,data->img2,data->disp,data->mindis,data->numdis);

pthread_exit(NULL);

}

void ComputeDisparity(Mat &image1,Mat &image2,Mat &disparity,int mindis,int numdis,int numthread=1)

{

if(numthread>1)

{

int height = image1.rows;

int width = image1.cols;

disparity.create(image1.size(),CV_16S);

int thread_height=height/numthread;

int d = height*0.07;

Data data[numthread];

Rect forseg[numthread];

for(int i=0;i<numthread-1;i++)

{

forseg[i]={0,i*thread_height,width,thread_height+d};

}

int final_height=(numthread-1)*thread_height;

forseg[numthread-1]={0,final_height,width,height-final_height};

for(int i=0;i<numthread;i++)

{

image1(forseg[i]).copyTo(data[i].img1);

image2(forseg[i]).copyTo(data[i].img2);

data[i].mindis=mindis;

data[i].numdis=numdis;

}

pthread_t p_thread[numthread-1];

for (int i=0;i<numthread-1;i++)

{

int err;

err=pthread_create(&p_thread[i],NULL,do_compute,&data[i+1]);

if(err!=0)

cerr<<"Create pthread failed!\n";

}

compute(data[0].img1,data[0].img2,data[0].disp,data[0].mindis,data[0].numdis);

for (int i=0;i<numthread-1;i++)

{

pthread_join(p_thread[i],NULL);

}

Rect formerge[numthread+1];

for(int i=0;i<numthread-2;i++)

{

formerge[i]={0,(i+1)*thread_height+d,width,thread_height};

}

int final_merge_height=(numthread-1)*thread_height+d;

formerge[numthread-2]={0,final_merge_height,width,height-final_merge_height};

formerge[numthread-1]={0,d,width,thread_height};

formerge[numthread]={0,d,width,height-final_merge_height};

data[0].disp.copyTo(disparity(forseg[0]));

for(int i=1;i<numthread-1;i++)

{

data[i].disp(formerge[numthread-1]).copyTo(disparity(formerge[i-1]));

}

data[numthread-1].disp(formerge[numthread]).copyTo(disparity(formerge[numthread-2]));

}

else

compute(image1,image2,disparity,mindis,numdis);

}

[email protected]:~/work/jni$ cat test.cpp

#include "opencv2/core/core.hpp"

#include "opencv2/highgui/highgui.hpp"

#include "opencv2/imgproc/imgproc.hpp"

#include <iostream>

using namespace std;

using namespace cv;

void ComputeDisparity(Mat &image1,Mat &image2,Mat &disparity,int mindis,int numdis,int numthread=1);

int main(){

Mat img1 = imread("tsukuba1color.png", 0);

Mat img2 = imread("tsukuba2color.png", 0);

Mat disparity,s;

for(int i=1;i<2;i++){

int64 t1=getTickCount();

ComputeDisparity(img1,img2,disparity,-16,16);

t1=getTickCount()-t1;

cout<<"1 threads:"<<t1/getTickFrequency()<<endl;

int64 t2=getTickCount();

ComputeDisparity(img1,img2,disparity,-16,16,2);

t2=getTickCount()-t2;

cout<<"2 threads:"<<t2/getTickFrequency()<<endl;

int64 t3=getTickCount();

ComputeDisparity(img1,img2,disparity,-16,16,3);

t3=getTickCount()-t3;

cout<<"3 threads:"<<t3/getTickFrequency()<<endl;

int64 t4=getTickCount();

ComputeDisparity(img1,img2,disparity,-16,16,4);

t4=getTickCount()-t4;

cout<<"4 threads:"<<t4/getTickFrequency()<<endl;

int64 t5=getTickCount();

ComputeDisparity(img1,img2,disparity,-16,16,5);

t5=getTickCount()-t5;

cout<<"5 threads:"<<t5/getTickFrequency()<<endl;

}

normalize(disparity, s, 0, 255, CV_MINMAX, CV_8U);

return 0;

}

[email protected]:~/work/jni$ 

[email protected]:~/work/jni$ cat x_cost.s 

.arm

.text

.global x_cost

x_cost:

push {r4-r6}

ldr r12,[sp,#12]@Sp

ldr r4,[sp,#16]@d8

ldr r5,[sp,#20] @D

ldr r6,[sp,#24] @minvalue

lsr r5,r5,#3

vdup.16       q0,r0       @delta0

mov           r0,#200     

vdup.16       q1,r0       @p1

mov         r0,#255

lsl    r0,r0,#7

add         r0,r0,#127

vdup.16 q9,r0    @minL0

vdup.16 q10,r0  @mins

mov r0,#-1

vdup.16 q11,r0  @bestdisp

vld1.16 {q12},[r4] @d8

mov r0,#8

vdup.16 q13,r0 @_8

.loop:

vld1.16       {q2},[r1]!    @Cp[d]

vld1.16       {q3},[r2]    @Lr_p0[d]

vmin.s16   q8,q3,q0 @min(delta0,Lr_p0[d])

sub r2,r2,#2

vld1.16 {q3},[r2] @Lr_p0[d-1]

vqadd.s16 q3,q3,q1 @Lr_p0[d-1]+P1

vmin.s16 q8,q8,q3 @min(delta0,Lr_p0[d],Lr_p0[d-1]+P1)

add r2,r2,#4

vld1.16 {q3},[r2] @Lr_p0[d+1]

vqadd.s16 q3,q3,q1 @Lr_p0[d+1]+P1

vmin.s16 q8,q8,q3 @min(delta0,Lr_p0[d],Lr_p0[d-1]+P1,Lr_p0[d+1]+P1)

vqsub.s16 q8,q8,q0 @minus delta0

vqadd.s16 q8,q8,q2 @L0

vst1.16 {q8},[r3]! @Lr_p[d]

vmin.s16 q9,q9,q8 @min(minL0,L0)

vld1.16 {q2},[r12] @Sp[d]

vqadd.s16 q2,q2,q8 @Sp[d]=Sp[d]+L0

vst1.16 {q2},[r12]! 

vcgt.s16 q3,q10,q2 @mask

vmin.s16 q10,q10,q2 @min(mins,sp[d])

veor q8,q11,q12@xor(bestdisp,d8)

vand q8,q8,q3

veor q11,q11,q8@bestdisp

vqadd.s16 q12,q12,q13 @d8+8

add r2,r2, #14

subs r5,r5,#1

bne         .loop

vmin.s16 d0,d18,d19

vmin.s16 d1,d20,d21

vtrn.16 d0,d1

vmin.s16 d0,d0,d1

vshr.s64 d2,d0,#32

vmin.s16 d0,d2,d0

vst1.16 {d0},[r6]!

vdup.16 q3,d0[1]

vceq.s16 q3,q10,q3 @qs

vand q2,q11,q3

vst1.16 {q2},[r6]!

pop           {r4-r6}

bx            lr

.end

[email protected]:~/work/jni$ ndk-build

Android NDK: OpenCV: You should ignore warning about 'non-system libraries in linker flags' and 'opencv_java' library.    

Android NDK:         'OPENCV_INSTALL_MODULES:=on' can be used to build APK with included OpenCV binaries    

Android NDK: WARNING:/home/archermind/work/jni/Android.mk:test: non-system libraries in linker flags: -lopencv_java3    

Android NDK:     This is likely to result in incorrect builds. Try using LOCAL_STATIC_LIBRARIES    

Android NDK:     or LOCAL_SHARED_LIBRARIES instead to list the library dependencies of the    

Android NDK:     current module    

[armeabi-v7a ] Compile arm    : test <= cost_cal.s

[armeabi-v7a ] Compile arm    : test <= cost_final.s

[armeabi-v7a ] Compile arm    : test <= cost_init.s

[armeabi-v7a ] Compile arm    : test <= l_cost.s

[armeabi-v7a ] Compile arm    : test <= x_cost.s

[armeabi-v7a ] Compile++ arm  : test <= test.cpp

[armeabi-v7a ] Compile++ arm  : test <= sgbm.cpp

[armeabi-v7a ] Executable     : test

[armeabi-v7a ] Install        : test => jni/test

[email protected]:~/work/jni$ 

在开发板上执行的结果

amt6757_wifi_n:/data # ./test

1 threads:0.104967

2 threads:0.0807542

3 threads:0.0708366

4 threads:0.0683645

5 threads:0.0744135

amt6757_wifi_n:/data #

我这里的左右图的大小都是 384X288

https://github.com/southmountain/SgbmwithNeon

文件提取链接

http://pan.baidu.com/s/1o8KF8tO

继续阅读