Resize源码详解(参考Opencv4.1)

2023-03-29 10:05:32

// Narrows two vectors of four unsigned 32-bit lanes into a single vector of
// eight unsigned 16-bit lanes. Lanes of `a` fill the low half of the result,
// lanes of `b` fill the high half. The narrowing is saturating (vqmovn_u32).
inline uint16x8_t v_pack(const uint32x4_t& a, const uint32x4_t& b)
{
    const uint16x4_t low_half = vqmovn_u32(a);
    const uint16x4_t high_half = vqmovn_u32(b);
    return vcombine_u16(low_half, high_half);
}

uint16x4_t = vqmovn_u32(const uint32x4_t) 饱和窄化：把每个 lane 的位宽减半（32 位 → 16 位），超出 16 位无符号范围的值被钳位到 0xFFFF

uint16x8_t = vcombine_u16(const uint16x4_t, const uint16x4_t) 组合两个16x4得到16x8

// Shifts every signed 16-bit lane of `a` and `b` right by 2 bits with
// rounding, saturates the result to the unsigned 8-bit range and packs both
// inputs into one 16-lane vector (`a` in the low half, `b` in the high half).
inline uint8x16_t v_rshr_pack_u(const int16x8_t& a, const int16x8_t& b)
{
    return vcombine_u8(vqrshrun_n_s16(a, 2), vqrshrun_n_s16(b, 2));
}

uint8x8_t = vqrshrun_n_s16(const int16x8_t, int a) 将每个值带舍入地右移 a 位，然后饱和为无符号数并把位宽减半（int16 → uint8，负值钳位到 0，大于 255 的值钳位到 255）

// Lane-wise signed multiply that keeps only the upper 16 bits of each 32-bit
// product, i.e. (a[i] * b[i]) >> 16 for each of the eight lanes.
inline int16x8_t v_mul_hi(const int16x8_t& a, const int16x8_t& b)
{
    // Widen to 32-bit products, four lanes at a time.
    const int32x4_t prod_low = vmull_s16(vget_low_s16(a), vget_low_s16(b));
    const int32x4_t prod_high = vmull_s16(vget_high_s16(a), vget_high_s16(b));

    // Drop the lower 16 bits of every product and rejoin the two halves.
    return vcombine_s16(vshrn_n_s32(prod_low, 16), vshrn_n_s32(prod_high, 16));
}

int16x4_t = vshrn_n_s32(const int32x4_t, int a) 将每个值右移 a 位后窄化为 16 位（不舍入、不饱和，只保留移位结果的低 16 位）

int32x4_t = vmull_s16(const int16x4_t, const int16x4_t) 加宽乘法：对应 lane 的两个 16 位有符号数相乘，得到 32 位乘积

// Functor that blends two rows of 32-bit integer samples with two 16-bit
// weights and stores the result as unsigned 8-bit values, using NEON.
struct VResizeLinearVec_32s8u
{
    // _src  : array whose first two entries are row pointers; they are
    //         reinterpreted below as `const int*` rows S0 and S1.
    // dst   : output row of unsigned 8-bit values.
    // _beta : reinterpreted below as `const short*`; beta[0] weights S0 and
    //         beta[1] weights S1.
    // width : number of elements in each row.
    // Returns the index of the first element NOT written by this call
    // (presumably the caller finishes [x, width) with scalar code - confirm).
    int operator()(const unsigned char** _src, unsigned char* dst, const unsigned char* _beta, int width ) const
    {
        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1];
        int x = 0;
        // Broadcast each weight to all eight 16-bit lanes.
        int16x8_t b0 = int16x8_t((vdupq_n_s16(beta[0]))), b1 = int16x8_t((vdupq_n_s16(beta[1])));

        // Main loop, u8_nlanes outputs per iteration (u8_nlanes / i32_nlanes are
        // defined outside this block; the four loads per row suggest 16 and 4).
        // Per output: each row sample is shifted right by 4 and packed to 16 bits,
        // multiplied by its weight keeping the high 16 bits (v_mul_hi), the two
        // rows are added, and v_rshr_pack_u applies a rounding shift right by 2
        // with saturation to unsigned 8-bit.
        // NOTE(review): the aligned and unaligned branches are textually
        // identical here - both use vld1q_s32 - so the alignment test does not
        // change what is executed.
        // NOTE(review): v_pack as shown in this file takes uint32x4_t, while the
        // values passed come from vld1q_s32 (signed) - confirm the intended overload.
        if( (((size_t)S0|(size_t)S1)&(SIMD_WIDTH - 1)) == 0 )
            for( ; x <= width - u8_nlanes; x += u8_nlanes)
                vst1q_u8(dst + x, v_rshr_pack_u(v_mul_hi(v_pack(vld1q_s32(S0 + x                 ) >> 4, vld1q_s32(S0 + x +     i32_nlanes) >> 4), b0) +
                                                v_mul_hi(v_pack(vld1q_s32(S1 + x                 ) >> 4, vld1q_s32(S1 + x +     i32_nlanes) >> 4), b1),
                                                v_mul_hi(v_pack(vld1q_s32(S0 + x + 2 * i32_nlanes) >> 4, vld1q_s32(S0 + x + 3 * i32_nlanes) >> 4), b0) +
                                                v_mul_hi(v_pack(vld1q_s32(S1 + x + 2 * i32_nlanes) >> 4, vld1q_s32(S1 + x + 3 * i32_nlanes) >> 4), b1)));
        else
            for( ; x <= width - u8_nlanes; x += u8_nlanes)
                vst1q_u8(dst + x, v_rshr_pack_u(v_mul_hi(v_pack(vld1q_s32(S0 + x                 ) >> 4, vld1q_s32(S0 + x +     i32_nlanes) >> 4), b0) +
                                                v_mul_hi(v_pack(vld1q_s32(S1 + x                 ) >> 4, vld1q_s32(S1 + x +     i32_nlanes) >> 4), b1),
                                                v_mul_hi(v_pack(vld1q_s32(S0 + x + 2 * i32_nlanes) >> 4, vld1q_s32(S0 + x + 3 * i32_nlanes) >> 4), b0) +
                                                v_mul_hi(v_pack(vld1q_s32(S1 + x + 2 * i32_nlanes) >> 4, vld1q_s32(S1 + x + 3 * i32_nlanes) >> 4), b1)));

            // NOTE(review): despite its indentation this loop is NOT part of the
            // `else` above (there are no braces); it runs after either branch.
            // It handles the remainder i16_nlanes elements at a time. The bound is
            // a strict `<`, so a final block of exactly i16_nlanes elements is not
            // processed here - confirm that this is intended.
            for( ; x < width - i16_nlanes; x += i16_nlanes)
                v_rshr_pack_u_store(dst + x, v_mul_hi(v_pack(vld1q_s32(S0 + x) >> 4, vld1q_s32(S0 + x + i32_nlanes) >> 4), b0) +
                                             v_mul_hi(v_pack(vld1q_s32(S1 + x) >> 4, vld1q_s32(S1 + x + i32_nlanes) >> 4), b1));

        return x;
    }
};

注：opencv 的 resize 先计算水平（horizontal）方向的临时结果，然后计算垂直（vertical）方向得到最终的值。上面的主循环一次计算一行中的 16 个值。由于采用定点计算，数据流为：以 int32x4_t 加载，右移 4 位并打包为 int16x8_t，与权重相乘取高 16 位后仍为 int16x8_t，两行结果相加后再带舍入右移 2 位并饱和打包为 uint8x16_t 写入目标行。

附上别的博客对一些指令的说明：https://blog.csdn.net/fuwenyan/article/details/78811034

Resize源码详解(参考Opencv4.1)

继续阅读

C语言第四章自述2第四章选择结构程序设计

面试题:vector和map的区别，异同。空间分布，100万数据存哪个比较合适。一、迭代器区别二、vector三、Map、Set四、vector_map 为什么比map效率高五、如何选择六、容器选择原则七、效率对比

C++ 多线程用条件变量确定线程的执行顺序而不是使用 sleep(1)

POJ 1284 Primitive Roots (欧拉函数&原根定理)

CQ V1.0分词bates(基于双数组tire树)—应该是目前最快的中文分词算法

成员函数初始化列表

2021-08-13c++——类之操作符重载

swmm与lisflood-fp源码如何一起编译 CMake命令

Windows下VS开发环境环境安装工程项目设置关于Debug和Release的提示

一文看懂字符串的加减乘除

C++ 第十五周报告1--《冒泡法排序》

C++实现简单顺序表

C经典书籍笔记——C陷阱与缺陷②(语法陷阱之优先级)一、错误案列二、优先级规律

线性表之顺序表的实现

C++判断素数、求最大公约数代码判断一个数是否为素数求两个数的最大公约数

SequoiaDB巨杉数据库C++驱动概述