圖形圖像處理－之－高品質的快速的圖像縮放補充使用SSE2優化

[email protected] 2011.04.12

tag: 圖像縮放,速度優化,線性插值,三次卷積插值,SSE2,scale,bilinear,bicubic,StretchBlt

摘要:

本文章對線性插值和三次卷積插值(bicubic)的實作做了一些新的優化嘗試;

使用了SSE2的128bit暫存器及相關指令;並預先建立SSE2用到的縮放係數表;

實作的結果在我的i7電腦上比以前的版本分別快出145%和75%!

線性插值的速度是StretchBlt的13倍!

正文:

(請先看看我的blog裡<高品質的快速的圖像縮放>的前3篇文章!)

支援SSE2指令集的CPU越來越多,CPU的SSE2實作性能也好了很多(以前不比MMX好多少),

而且軟體在64位元模式下不再支援MMX,所以嘗試了用SSE2來優化縮放,效果不錯!

速度測試說明:

只測試記憶體資料到記憶體資料的縮放

測試圖檔都是800*600縮放到1024*768,單線程;fps表示每秒鐘的幀數,值越大表示函數越快.

速度測試對比: (CPU:i7 920 記憶體:DDR3 1333 3通道)

(windows)

StretchBlt 近鄰取樣 869.09 fps

StretchBlt 線性插值 44.46 fps //SetStretchBltMode(dc,4);?

PicZoom0: 95.69 fps

PicZoom1: 158.35 fps

PicZoom2: 332.78 fps

PicZoom3: 1172.79 fps

PicZoom3_float: 874.13 fps

PicZoom3_Table: 1158.30 fps

PicZoom3_SSE: 1908.40 fps

PicZoom_Bilinear0: 28.80 fps

PicZoom_Bilinear1: 56.09 fps

PicZoom_Bilinear2: 97.09 fps

PicZoom_Bilinear_Common: 119.83 fps

PicZoom_Bilinear_MMX: 180.12 fps

PicZoom_Bilinear_MMX_Ex: 237.34 fps

PicZoom_ftBilinear_Common: 118.67 fps

PicZoom_ftBilinear_MMX: 213.68 fps

PicZoom_ThreeOrder0: 6.11 fps

PicZoom_ThreeOrder_Common: 25.38 fps

PicZoom_ThreeOrder_MMX: 52.32 fps

(SSE2的實作)

PicZoom_ftBilinearTable_SSE2: 588.24 fps

PicZoom_ThreeOrderTable_SSE2: 93.24 fps

PicZoom_ftBilinearTable_SSE2實作代碼如下:

//64-bit unsigned value used in this file to carry four packed 16-bit coefficients;
//two consecutive TMMXData64 values form one 128-bit SSE2 memory operand.
typedef UInt64 TMMXData64;

//ftBilinearTable_SSE2: bilinear interpolation of ONE destination pixel (inline-asm fragment).
//Register contract, as set up by PicZoom_ftBilinearTable_SSE2:
//  ebx         = byte offset of the destination pixel (the callers run it from a negative value up to 0)
//  [edx+ebx]   = source x index for this pixel (entry of xList)
//  esi         = current source row,  xmm5 = its weight vr broadcast to all eight 16-bit lanes
//  ecx         = next source row,     xmm6 = its weight v  broadcast to all eight 16-bit lanes
//  [ebp+ebx*4] = 16-byte aligned x weights (ur,ur,ur,ur | u,u,u,u) for the left/right pixel
//  xmm7        = 0
//  out         : 32-bit pixel stored to [edi+ebx]; clobbers eax, xmm0, xmm1
//Every line except the last must end in a backslash so the whole sequence is one macro body.
//Both operands of pmullw must be XMM registers (an MMX register cannot be combined with an XMM one).
//The fragment can alternatively be written as a __declspec(naked) function ending in ret and
//invoked with call.
#define ftBilinearTable_SSE2() \
    asm mov eax,[edx+ebx] \
    asm movq xmm0,qword ptr[esi+eax*4] \
    asm movq xmm1,qword ptr[ecx+eax*4] \
    asm punpcklbw xmm0,xmm7 \
    asm punpcklbw xmm1,xmm7 \
    asm pmullw xmm0,xmm5 \
    asm pmullw xmm1,xmm6 \
    asm paddw xmm0,xmm1 \
    asm pmulhw xmm0,xmmword ptr [ebp+ebx*4] \
    asm movdqa xmm1,xmm0 \
    asm punpckhqdq xmm0,xmm0 \
    asm paddw xmm0,xmm1 \
    asm packuswb xmm0,xmm7 \
    asm movd dword ptr [edi+ebx],xmm0

//ftBilinearTable_SSE2_expand2: bilinear interpolation of TWO adjacent destination pixels
//(inline-asm fragment, two-pixel version of the single-pixel bilinear fragment).
//Register contract, as set up by PicZoom_ftBilinearTable_SSE2:
//  ebx                  = byte offset of the first destination pixel
//  [edx+ebx],[edx+ebx+4]= source x indices of the two pixels (entries of xList)
//  esi / ecx            = current / next source row
//  xmm5 / xmm6          = row weights vr / v broadcast to all eight 16-bit lanes
//  [ebp+ebx*4], +16     = 16-byte aligned x weights (ur x4 | u x4) of the first / second pixel
//  xmm7                 = 0
//  out                  : two 32-bit pixels (8 bytes) stored to [edi+ebx];
//                         clobbers eax, xmm0-xmm3
//Every line except the last must end in a backslash; the last instruction must NOT carry a
//continuation, otherwise the following source line is absorbed into the macro.
//Both operands of pmullw must be XMM registers.
#define ftBilinearTable_SSE2_expand2() \
    asm mov eax,[edx+ebx] \
    asm movq xmm0,qword ptr[esi+eax*4] \
    asm movq xmm1,qword ptr[ecx+eax*4] \
    asm mov eax,[edx+ebx+4] \
    asm movq xmm2,qword ptr[esi+eax*4] \
    asm movq xmm3,qword ptr[ecx+eax*4] \
    asm punpcklbw xmm0,xmm7 \
    asm punpcklbw xmm1,xmm7 \
    asm punpcklbw xmm2,xmm7 \
    asm punpcklbw xmm3,xmm7 \
    asm pmullw xmm0,xmm5 \
    asm pmullw xmm1,xmm6 \
    asm pmullw xmm2,xmm5 \
    asm pmullw xmm3,xmm6 \
    asm paddw xmm0,xmm1 \
    asm paddw xmm2,xmm3 \
    asm pmulhw xmm0,xmmword ptr [ebp+ebx*4] \
    asm pmulhw xmm2,xmmword ptr [ebp+ebx*4+16] \
    asm movdqa xmm1,xmm0 \
    asm punpcklqdq xmm0,xmm2 \
    asm punpckhqdq xmm1,xmm2 \
    asm paddw xmm0,xmm1 \
    asm packuswb xmm0,xmm7 \
    asm movq qword ptr [edi+ebx],xmm0

//Scales Src into Dst with bilinear interpolation using SSE2 and precomputed per-column tables.
//  Dst : destination image; every pixel of it is written.
//  Src : source image; must be at least 2x2 because each output pixel reads a 2x2 neighbourhood.
//Returns without doing anything when Dst is empty or Src is smaller than 2x2.
//A temporary buffer is allocated with new[] for the tables and released before returning.
void PicZoom_ftBilinearTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
    if (  (0==Dst.width)||(0==Dst.height)
        ||(2>Src.width)||(2>Src.height)) return;

    //16.16 fixed-point source step per destination pixel; (size-1) keeps x+1 / y+1 inside the source
    long xrIntFloat_16=((Src.width-1)<<16)/Dst.width;
    long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;

    long dst_width=Dst.width;
    //one allocation: dst_width pairs of x weights (16 bytes each, +15 for alignment) followed by dst_width x indices
    UInt8* _bufMem=new UInt8[(dst_width*2*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)];
    TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4); //16-byte aligned
    Int32* xList=(Int32*)(uList+dst_width*2);
    {//init u table
        long srcx_16=0;
        for (long x=0;x<dst_width*2;x+=2){
            xList[x>>1]=(srcx_16>>16);              //integer source column
            unsigned long u=(srcx_16>>8)&0xFF;      //8-bit fraction = weight of the right pixel
            unsigned long ur=(256-u)<<1;            //weight of the left pixel, doubled for pmulhw
            u=u<<1;
            //replicate each 16-bit weight into all four lanes of its 64-bit entry
            uList[x+0]=(ur|(ur<<16));
            uList[x+0]|=uList[x+0]<<32;
            uList[x+1]=u|(u<<16);
            uList[x+1]|=uList[x+1]<<32;
            srcx_16+=xrIntFloat_16;
        }
    }//end of the init-u-table scope (this closing brace was missing)

    Color32* pDstLine=Dst.pdata;
    long srcy_16=0;
    asm pxor xmm7,xmm7 //xmm7=0
    for (long y=0;y<Dst.height;++y){
        unsigned long v=(srcy_16>>8) & 0xFF;    //8-bit fraction = weight of the next row
        unsigned long vr=(256-v)>>1;            //weight of the current row, halved so pixel*weight fits 16 bits
        v>>=1;
        Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src.byte_width*(srcy_16>>16)) ;
        Color32* PSrcLineColorNext= (Color32*)((UInt8*)(PSrcLineColor)+Src.byte_width) ;
        asm{
            //broadcast vr / v to all eight 16-bit lanes of xmm5 / xmm6
            movd xmm5,vr
            movd xmm6,v
            punpcklwd xmm5,xmm5
            punpcklwd xmm6,xmm6
            punpckldq xmm5,xmm5
            punpckldq xmm6,xmm6
            punpcklqdq xmm5,xmm5
            punpcklqdq xmm6,xmm6

            mov esi,PSrcLineColor
            mov ecx,PSrcLineColorNext
            mov edx,xList //x
            mov ebx,dst_width
            mov edi,pDstLine
            //all locals are loaded before ebp is repurposed as the uList pointer
            push ebp
            mov ebp,uList

            //first pass: the even part of the row, two pixels per iteration
            push ebx
            and ebx,(not 1)
            test ebx,ebx
            jle end_loop2
            //point edi/edx/ebp past the end of the span and index with a negative offset counting up to 0
            lea ebx,[ebx*4]
            lea edi,[edi+ebx]
            lea edx,[edx+ebx]
            lea ebp,[ebp+ebx*4]
            neg ebx
        loop2_start:
            //call ftBilinearTable_SSE2_expand2
            ftBilinearTable_SSE2_expand2()
            add ebx,8
            jnz loop2_start
        end_loop2:

            //second pass: the single remaining pixel when dst_width is odd
            pop ebx
            and ebx,1
            test ebx,ebx
            jle end_write
            lea ebx,[ebx*4]
            lea edi,[edi+ebx]
            lea edx,[edx+ebx]
            lea ebp,[ebp+ebx*4]
            neg ebx
        loop1_start:
            //call ftBilinearTable_SSE2
            ftBilinearTable_SSE2()
            add ebx,4
            jnz loop1_start
        end_write:
            pop ebp
        }
        srcy_16+=yrIntFloat_16;
        ((UInt8*&)pDstLine)+=Dst.byte_width;
    }
    delete []_bufMem;
}

PicZoom_ThreeOrderTable_SSE2實作代碼如下:

//Lookup table of SinXDivX sampled at i/256 for i in [0,512], in fixed point scaled by 1<<14.
//Each entry holds the 16-bit coefficient replicated into all four 16-bit lanes.
static TMMXData64 SinXDivX_Table64_MMX[(2<<8)+1];

//Helper whose constructor fills SinXDivX_Table64_MMX; a file-scope instance below runs it
//during static initialisation.
class _CAutoInti_SinXDivX_Table64_MMX {
private:
    void _Inti_SinXDivX_Table64_MMX()
    {
        for (long i=0;i<=(2<<8);++i)
        {
            //Convert through a signed integer first: the kernel value is presumably negative for
            //part of the range (as bicubic kernels are), and converting a negative double directly
            //to an unsigned type is undefined behaviour. Truncating to long and then narrowing
            //keeps the two's-complement 16-bit pattern that the signed pmulhw multiplies expect.
            unsigned short t=(unsigned short)(long)(0.5+(1<<14)*SinXDivX(i*(1.0/(256))));
            unsigned long tl=t|(((unsigned long)t)<<16);
            TMMXData64 tll=tl|(((TMMXData64)tl)<<32);
            SinXDivX_Table64_MMX[i]=tll;
        }
    }
public:
    _CAutoInti_SinXDivX_Table64_MMX() { _Inti_SinXDivX_Table64_MMX(); }
};

static _CAutoInti_SinXDivX_Table64_MMX __tmp_CAutoInti_SinXDivX_Table64_MMX;

//_private_ThreeOrderTable_Fast_SSE2_2: weighted sum of TWO source rows of four pixels each
//(inline-asm fragment used twice by ThreeOrderTable_Fast_SSE2).
//  eax          = first pixel of the first row; the second row is at eax+edx
//  ecx          = column coefficients: [ecx] for pixels 0,1 and [ecx+16] for pixels 2,3
//  ebx          = row coefficients: [ebx] for the first row, [ebx+16] for the second row
//  xmm7         = 0
//  out          : xmm0 = sum of both rows, still split into two half-sums
//                 (low 64 bits: pixels 0 and 2, high 64 bits: pixels 1 and 3);
//                 clobbers xmm1-xmm3
//Channels are shifted left by 7 before the two pmulhw steps so precision survives the >>16
//that each pmulhw performs.
//Every line except the last must end in a backslash; the last instruction must NOT carry a
//continuation, otherwise the following source line is absorbed into the macro.
#define _private_ThreeOrderTable_Fast_SSE2_2() \
    asm movq xmm0,qword ptr [eax] \
    asm movq xmm1,qword ptr [eax+8] \
    asm movq xmm2,qword ptr [eax+edx] \
    asm movq xmm3,qword ptr [eax+edx+8] \
    asm punpcklbw xmm0,xmm7 \
    asm punpcklbw xmm1,xmm7 \
    asm punpcklbw xmm2,xmm7 \
    asm punpcklbw xmm3,xmm7 \
    asm psllw xmm0,7 \
    asm psllw xmm1,7 \
    asm psllw xmm2,7 \
    asm psllw xmm3,7 \
    asm pmulhw xmm0,xmmword ptr [ecx] \
    asm pmulhw xmm1,xmmword ptr [ecx+16] \
    asm pmulhw xmm2,xmmword ptr [ecx] \
    asm pmulhw xmm3,xmmword ptr [ecx+16] \
    asm paddsw xmm0,xmm1 \
    asm paddsw xmm2,xmm3 \
    asm pmulhw xmm0,xmmword ptr [ebx] \
    asm pmulhw xmm2,xmmword ptr [ebx+16] \
    asm paddsw xmm0,xmm2

//Interpolates one output pixel from the 4x4 pixel neighbourhood starting at `pixel`.
//  pixel      : top-left pixel of the 4x4 neighbourhood
//  byte_width : byte distance between consecutive rows of that neighbourhood
//  v4         : row coefficients, 16 bytes per row (read at byte offsets 0,16,32,48)
//  u4         : column coefficients, read at byte offsets 0 and 16
//Expects xmm7 to be zero on entry; overwrites eax,ebx,ecx,edx and xmm0-xmm3,xmm5,xmm6.
//There is no return statement: the packed pixel is left in eax.
//NOTE(review): presumably relies on the compiler returning 32-bit values in eax - confirm for the target toolchain.
must_inline UInt32 ThreeOrderTable_Fast_SSE2(const Color32* pixel,long byte_width,const TMMXData64* v4,const TMMXData64* u4){

asm mov eax,pixel

asm mov edx,byte_width

asm mov ebx,v4

asm mov ecx,u4

//asm call _private_ThreeOrderTable_Fast_SSE2_2

//rows 0 and 1
_private_ThreeOrderTable_Fast_SSE2_2();

asm movdqa xmm6,xmm0

asm lea eax,[eax+edx*2] //advance two rows (2*byte_width)

asm lea ebx,[ebx+32] //advance to the row coefficients of rows 2 and 3

//asm call _private_ThreeOrderTable_Fast_SSE2_2

//rows 2 and 3
_private_ThreeOrderTable_Fast_SSE2_2();

asm paddsw xmm6,xmm0

asm movdqa xmm5,xmm6

asm psrldq xmm6,8 //shift right by 8 bytes (64 bits) to bring the high half-sum down

asm paddsw xmm5,xmm6

asm psraw xmm5,3 //drop the 3 fractional bits left over (<<7, then two pmulhw by 1<<14 coefficients)

asm packuswb xmm5,xmm7 //saturate each channel to 8 bits

asm movd eax,xmm5 //result pixel is returned in eax

}

//Clamps x for use as a border-safe coordinate: values <= 0 give 0, values >= maxx give maxx,
//anything in between is returned unchanged.
must_inline long getSizeBorder(long x,long maxx){
    if (x>0){
        return (x<maxx) ? x : maxx;
    }
    return 0;
}

//Border-safe variant of the 4x4 interpolation: gathers the 4x4 neighbourhood whose top-left
//corner is (x0_sub1,y0_sub1) into a local buffer, clamping every coordinate into the image with
//getSizeBorder, then runs ThreeOrderTable_Fast_SSE2 on that buffer.
//  pic             : source image
//  x0_sub1,y0_sub1 : top-left corner of the neighbourhood (may lie outside the image)
//  v4,u4           : row / column coefficient tables, passed through unchanged
must_inline UInt32 ThreeOrderTable_Border_SSE2(const TPixels32Ref& pic,const long x0_sub1,const long y0_sub1,const TMMXData64* v4,const TMMXData64* u4){
    Color32 pixel[16];
    const long lastRow=pic.height-1;
    const long lastCol=pic.width-1;

    //the four clamped column indices are the same for every row
    long cols[4];
    for (long k=0;k<4;++k)
        cols[k]=getSizeBorder(x0_sub1+k,lastCol);

    for (long row=0;row<4;++row){
        Color32* pLine=pic.getLinePixels(getSizeBorder(y0_sub1+row,lastRow));
        Color32* dst=&pixel[row*4];
        for (long k=0;k<4;++k)
            dst[k]=pLine[cols[k]];
    }
    //the local buffer is a tightly packed 4x4 block, so its row stride is four pixels
    return ThreeOrderTable_Fast_SSE2(pixel,4*sizeof(Color32),v4,u4);
}

void PicZoom_ThreeOrderTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)

{

if ( (0==Dst.width)||(0==Dst.height)

||(0==Src.width)||(0==Src.height)) return;

long dst_width=Dst.width;

long dst_height=Dst.height;

long xrIntFloat_16=((Src.width)<<16)/dst_width+1;

long yrIntFloat_16=((Src.height)<<16)/dst_height+1;

const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);

const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);

//計算出需要特殊處理的邊界

long border_y0=((1<<16)-csDErrorY)/yrIntFloat_16+1;//y0+y*yr>=1; y0=csDErrorY => y>=(1-csDErrorY)/yr

if (border_y0>=dst_height) border_y0=dst_height;

long border_x0=((1<<16)-csDErrorX)/xrIntFloat_16+1;

if (border_x0>=dst_width ) border_x0=dst_width;

long border_y1=(((Src.height-3)<<16)-csDErrorY)/yrIntFloat_16+1; //y0+y*yr<=(height-3) => y<=(height-3-csDErrorY)/yr

if (border_y1<border_y0) border_y1=border_y0;

long border_x1=(((Src.width-3)<<16)-csDErrorX)/xrIntFloat_16+1;;

if (border_x1<border_x0) border_x1=border_x0;

UInt8* _bufMem=new UInt8[(dst_width*4*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)];

TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4); //16byte對齊

Int32* xList=(Int32*)(uList+dst_width*4);

{//init u table

long srcx_16=csDErrorX;

for (long x=0;x<dst_width*4;x+=4){

xList[x>>2]=(srcx_16>>16)-1;

long u=(srcx_16>>8)&0xFF;

uList[x+0]=SinXDivX_Table64_MMX[256+u];

uList[x+1]=SinXDivX_Table64_MMX[u];

uList[x+2]=SinXDivX_Table64_MMX[256-u];

uList[x+3]=SinXDivX_Table64_MMX[512-u];

srcx_16+=xrIntFloat_16;

}

TMMXData64 _v4[8+2];

TMMXData64* v4=(&_v4[0]); v4=(TMMXData64*)( (((ptrdiff_t)v4)+15)>>4<<4);

asm pxor xmm7,xmm7

Color32* pDstLine=Dst.pdata;

long srcy_16=csDErrorY;

for (long y=0;y<dst_height;++y){

//v table

const long srcy_sub1=(srcy_16>>16)-1;

const long v=(srcy_16>>8)&0xFF;

v4[0]=SinXDivX_Table64_MMX[256+v];

v4[1]=v4[0];

v4[2]=SinXDivX_Table64_MMX[v];

v4[3]=v4[2];

v4[4]=SinXDivX_Table64_MMX[256-v];

v4[5]=v4[4];

v4[6]=SinXDivX_Table64_MMX[512-v];

v4[7]=v4[6];

if ((y<border_y0)||(y>=border_y1)){

for (long x=0;x<dst_width;++x)

pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]); //border

}else{

for (long x=0;x<border_x0;++x)

pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]);//border

const Color32* pixelLine=Src.getLinePixels(srcy_sub1);

long byte_width=Src.byte_width;

for (long x=border_x0;x<border_x1;++x)

pDstLine[x].argb=ThreeOrderTable_Fast_SSE2(&pixelLine[xList[x]],byte_width,v4,&uList[x*4]);//fast MMX !

for (long x=border_x1;x<dst_width;++x)

pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]);//border

}

srcy_16+=yrIntFloat_16;

((UInt8*&)pDstLine)+=Dst.byte_width;

}

delete []_bufMem;

}

圖形圖像處理－之－高品質的快速的圖像縮放補充使用SSE2優化

繼續閱讀

JPEG壓縮技術的原理

invalid byte 1 of 1-byte UTF-8 sequence

出現invalid byte 1 of 1-byte UTF-8 sequence問題

Python3 色情圖檔識别 Python 3 色情圖檔識别

手機遊戲優化技巧

如何存取圖檔到資料庫(SQL Server)

轉詳解C#資料庫存取圖檔三大方式

SOFTICE 使用說明 (斷點)

DOS常用指令的使用

BMP檔案結構及圖像每行位元組計算方法

磁盤結構及在Linux中的命名

處理PCX檔案

如何用Opencv求圖像的灰階投影曲線

IE8 CSS設定DIV居中，添加“margin:0 auto”

Small tricks

無元件上傳圖檔到資料庫中，最完整解決方案

圖形圖像處理－之－高品質的快速的圖像縮放 補充 使用SSE2優化

繼續閱讀

圖形圖像處理－之－高品質的快速的圖像縮放補充使用SSE2優化