</pre><p>圖像預處理中的90度整數倍旋轉與鏡像,都可由以下兩個基本操作組合得出;經測試,速度比單點操作提升約3-5倍。</p><p></p><p></p><pre code_snippet_id="1889082" snippet_file_name="blog_20160920_3_6201408" name="code" class="cpp">//鏡像-1*16單位元組矩陣
// Reorder the 16 bytes of one 1x16 byte row.
//
// Loads 16 bytes from pSrc_tmp, permutes them with the byte-shuffle mask
// `sort`, and stores the 16 result bytes to pDst_tmp. Both the load and the
// store are the unaligned variants.
//
// Per _mm_shuffle_epi8: output byte i is the source byte selected by the low
// 4 bits of mask byte i, or 0 when that mask byte has its top bit set. The
// permutation is entirely decided by the caller's mask; a mask holding the
// indices 15,14,...,0 yields a left-right mirror of the row.
void MirrorMatrix(__m128i *pSrc_tmp ,__m128i *pDst_tmp , __m128i sort)
{
    __m128i row = _mm_loadu_si128(pSrc_tmp);
    __m128i shuffled = _mm_shuffle_epi8(row, sort);

    _mm_storeu_si128(pDst_tmp, shuffled);
}
// Transpose a 16x16 byte matrix.
//
// pSrc_tmp: array of 16 pointers; pSrc_tmp[r] addresses the 16 bytes of
//           source row r.
// pDst_tmp: array of 16 pointers; pDst_tmp[c] receives the 16 bytes of
//           source column c (i.e. row c of the transposed matrix).
//
// All loads and stores are the unaligned variants. Every source row is loaded
// before any destination row is stored.
//
// The transpose is a four-stage interleave network with element widths of
// 8, 16, 32 and 64 bits. Every stage has the same shape: the 16 registers are
// split into groups of G (G = 16, 8, 4, 2); inside a group, neighbouring
// registers (2i, 2i+1) are interleaved, the low-half results fill the first
// G/2 slots of the group and the high-half results fill the last G/2 slots.
// Two scratch buffers are used alternately as input and output of a stage.
void TransposeMatrix(__m128i** pSrc_tmp , __m128i** pDst_tmp)
{
    __m128i bufA[16];
    __m128i bufB[16];
    int base, i;

    for (i = 0; i < 16; i++)
    {
        bufA[i] = _mm_loadu_si128(pSrc_tmp[i]);
    }

    // Stage 1, bytes, one group of 16:
    // bufB[i] holds columns 0..7 and bufB[8+i] columns 8..15 of rows 2i, 2i+1.
    for (i = 0; i < 8; i++)
    {
        bufB[i]     = _mm_unpacklo_epi8(bufA[2 * i], bufA[2 * i + 1]);
        bufB[8 + i] = _mm_unpackhi_epi8(bufA[2 * i], bufA[2 * i + 1]);
    }

    // Stage 2, 16-bit units, groups of 8:
    // bufA[4q+i] holds columns 4q..4q+3 of rows 4i..4i+3.
    for (base = 0; base < 16; base += 8)
    {
        for (i = 0; i < 4; i++)
        {
            bufA[base + i]     = _mm_unpacklo_epi16(bufB[base + 2 * i], bufB[base + 2 * i + 1]);
            bufA[base + 4 + i] = _mm_unpackhi_epi16(bufB[base + 2 * i], bufB[base + 2 * i + 1]);
        }
    }

    // Stage 3, 32-bit units, groups of 4:
    // bufB[2q+i] holds columns 2q, 2q+1 of rows 8i..8i+7.
    for (base = 0; base < 16; base += 4)
    {
        for (i = 0; i < 2; i++)
        {
            bufB[base + i]     = _mm_unpacklo_epi32(bufA[base + 2 * i], bufA[base + 2 * i + 1]);
            bufB[base + 2 + i] = _mm_unpackhi_epi32(bufA[base + 2 * i], bufA[base + 2 * i + 1]);
        }
    }

    // Stage 4, 64-bit units, groups of 2:
    // bufA[c] holds column c of all 16 rows, i.e. row c of the transpose.
    for (base = 0; base < 16; base += 2)
    {
        bufA[base]     = _mm_unpacklo_epi64(bufB[base], bufB[base + 1]);
        bufA[base + 1] = _mm_unpackhi_epi64(bufB[base], bufB[base + 1]);
    }

    for (i = 0; i < 16; i++)
    {
        _mm_storeu_si128(pDst_tmp[i], bufA[i]);
    }
}