天天看点

Resize源码详解(参考Opencv4.1)

/*
 * Saturating narrow of two 4-lane 32-bit unsigned vectors into one 8-lane
 * 16-bit unsigned vector. Lanes of `a` become the low four lanes of the
 * result, lanes of `b` the high four. Values above 65535 saturate to 65535.
 */
inline uint16x8_t v_pack(const uint32x4_t& a, const uint32x4_t& b)
{
    const uint16x4_t lo = vqmovn_u32(a);
    const uint16x4_t hi = vqmovn_u32(b);
    return vcombine_u16(lo, hi);
}

/*
 * Signed counterpart of the overload above: saturating narrow of two 4-lane
 * 32-bit signed vectors into one 8-lane 16-bit signed vector (values are
 * clamped to [-32768, 32767]).
 *
 * The resize kernel in this file packs the result of `vld1q_s32(...) >> 4`
 * (an int32x4_t) and passes it on to v_mul_hi(), which takes int16x8_t, so a
 * signed overload is required for those calls to be type-correct without
 * relying on lax vector conversions.
 */
inline int16x8_t v_pack(const int32x4_t& a, const int32x4_t& b)
{
    const int16x4_t lo = vqmovn_s32(a);
    const int16x4_t hi = vqmovn_s32(b);
    return vcombine_s16(lo, hi);
}
           

uint16x4_t = vqmovn_u32(const uint32x4_t)  饱和窄化:将每个lane的位宽减半(32位 → 16位),超出uint16范围的值饱和为65535

uint16x8_t = vcombine_u16(const uint16x4_t, const uint16x4_t) 将两个16x4拼接为一个16x8(第一个参数为低4个lane,第二个参数为高4个lane)

/*
 * Rounding shift right by 2 of every signed 16-bit lane, followed by a
 * saturating narrow to unsigned 8-bit (negative values become 0, values
 * above 255 become 255). The 8 lanes of `a` form the low half of the result
 * and the 8 lanes of `b` the high half.
 */
inline uint8x16_t v_rshr_pack_u(const int16x8_t& a, const int16x8_t& b)
{
    /* The shift amount must be a compile-time immediate for vqrshrun_n_s16. */
    return vcombine_u8(vqrshrun_n_s16(a, 2), vqrshrun_n_s16(b, 2));
}
           

uint8x8_t = vqrshrun_n_s16(const int16x8_t, int a) 将每个值带舍入地右移a位(即先加上2^(a-1)再右移),然后饱和窄化为无符号8位(位宽减半;负数饱和为0,大于255的值饱和为255)

/*
 * Per-lane signed 16x16 multiplication that keeps only the upper 16 bits of
 * each 32-bit product, i.e. (a[i] * b[i]) >> 16 for each of the 8 lanes.
 */
inline int16x8_t v_mul_hi(const int16x8_t& a, const int16x8_t& b)
{
    /* Widening multiplies: 4 lanes at a time, full 32-bit products. */
    const int32x4_t prod_lo = vmull_s16(vget_low_s16(a), vget_low_s16(b));
    const int32x4_t prod_hi = vmull_s16(vget_high_s16(a), vget_high_s16(b));

    /* Shift right by 16 and narrow (no rounding, no saturation). */
    return vcombine_s16(vshrn_n_s32(prod_lo, 16), vshrn_n_s32(prod_hi, 16));
}
           

int16x4_t  = vshrn_n_s32(const int32x4_t,int a) 将每个lane右移a位后窄化为16位(只保留低16位,不做舍入,也不做饱和)

int32x4_t = vmull_s16(const int16x4_t,const int16x4_t) 逐lane相乘并扩展位宽(16位 × 16位 → 32位),因此乘积不会溢出

/*
 * Vectorised vertical pass of the fixed-point linear resize.
 *
 * Two rows of 32-bit intermediate values (S0, S1) are blended with two
 * 16-bit weights (beta[0], beta[1]) and written as 8-bit pixels. Per element:
 *
 *   t      = (((S0[x] >> 4) * beta[0]) >> 16) + (((S1[x] >> 4) * beta[1]) >> 16)
 *   dst[x] = saturate_u8((t + 2) >> 2)
 *
 * Parameters:
 *   _src   array of row pointers; reinterpreted as `const int**`, of which
 *          src[0] and src[1] are read.
 *   dst    output row of unsigned 8-bit values.
 *   _beta  reinterpreted as `const short*`; beta[0] and beta[1] are read.
 *   width  number of elements in the row.
 *
 * Returns the number of leading elements that were processed here.
 * NOTE(review): the remaining [x, width) elements are presumably handled by
 * a scalar loop in the caller - confirm against the call site.
 *
 * The original listing had separate "aligned" and "unaligned" branches whose
 * bodies were identical (both used vld1q_s32, which has no alignment
 * requirement), so they are merged into a single loop here.
 */
struct VResizeLinearVec_32s8u
{
    int operator()(const unsigned char** _src, unsigned char* dst, const unsigned char* _beta, int width ) const
    {
        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int* S0 = src[0];
        const int* S1 = src[1];

        /* Broadcast each weight to all 8 lanes. */
        const int16x8_t b0 = vdupq_n_s16(beta[0]);
        const int16x8_t b1 = vdupq_n_s16(beta[1]);

        int x = 0;

        /* Main loop: u8_nlanes output pixels per iteration, built from two
           half-width blends that are rounded, narrowed and stored together. */
        for( ; x <= width - u8_nlanes; x += u8_nlanes)
        {
            const int16x8_t lo = blend(S0 + x, S1 + x, b0, b1);
            const int16x8_t hi = blend(S0 + x + 2 * i32_nlanes, S1 + x + 2 * i32_nlanes, b0, b1);
            vst1q_u8(dst + x, v_rshr_pack_u(lo, hi));
        }

        /* Tail loop: i16_nlanes output pixels per iteration.
           NOTE(review): the strict '<' means a final group of exactly
           i16_nlanes elements is not processed here; this is kept as in the
           original listing - confirm it is intended. */
        for( ; x < width - i16_nlanes; x += i16_nlanes)
            v_rshr_pack_u_store(dst + x, blend(S0 + x, S1 + x, b0, b1));

        return x;
    }

private:
    /* Loads 2 * i32_nlanes ints from each row, drops 4 fractional bits,
       narrows to 16-bit, and returns v_mul_hi(row0, w0) + v_mul_hi(row1, w1). */
    static int16x8_t blend(const int* r0, const int* r1, const int16x8_t& w0, const int16x8_t& w1)
    {
        return v_mul_hi(v_pack(vld1q_s32(r0) >> 4, vld1q_s32(r0 + i32_nlanes) >> 4), w0) +
               v_mul_hi(v_pack(vld1q_s32(r1) >> 4, vld1q_s32(r1 + i32_nlanes) >> 4), w1);
    }
};
           

注:opencv的resize先计算水平(horizontal)方向的临时结果,然后计算垂直(vertical)方向得到最终的值,一次计算一行的16个值。由于数据采用定点计算方式,数据流从int32x4_t加载,右移并窄化后与权重相乘得到16位结果(每半边为int16x4_t,合并为int16x8_t),最后经带舍入的右移和饱和窄化变为uint8x8_t(两个uint8x8_t组合成uint8x16_t后写入dst)

附上别的博客对一些指令的说明:https://blog.csdn.net/fuwenyan/article/details/78811034

继续阅读