圖形圖像處理-之-高品質的快速的圖像縮放 補充 使用SSE2優化
[email protected] 2011.04.12
tag: 圖像縮放,速度優化,線性插值,三次卷積插值,SSE2,scale,bilinear,bicubic,StretchBlt
摘要:
本文章對線性插值和三次卷積插值(bicubic)的實作做了一些新的優化嘗試;
使用了SSE2的128bit寄存器及相關指令;並預先建立SSE2用到的縮放係數表;
實作的結果在我的i7電腦上比以前的版本分別快出145%和75%!
線性插值的速度是StretchBlt的13倍!
正文:
(請先看看我的blog裡<高品質的快速的圖像縮放>的前3篇文章!)
支援SSE2指令集的CPU越來越多,CPU的SSE2實作性能也好了很多(以前不比MMX好多少),
而且軟體在64位模式的時候不再支援MMX,所以嘗試了SSE2的縮放優化,效果不錯!
速度測試說明:
隻測試記憶體資料到記憶體資料的縮放
測試圖檔都是800*600縮放到1024*768,單線程;fps表示每秒鐘的幀數,值越大表示函數越快.
速度測試對比: (CPU:i7 920 記憶體:DDR3 1333 3通道)
(windows)
StretchBlt 近鄰取樣 869.09 fps
StretchBlt 線性插值 44.46 fps //SetStretchBltMode(dc,4);?
PicZoom0: 95.69 fps
PicZoom1: 158.35 fps
PicZoom2: 332.78 fps
PicZoom3: 1172.79 fps
PicZoom3_float: 874.13 fps
PicZoom3_Table: 1158.30 fps
PicZoom3_SSE: 1908.40 fps
PicZoom_Bilinear0: 28.80 fps
PicZoom_Bilinear1: 56.09 fps
PicZoom_Bilinear2: 97.09 fps
PicZoom_Bilinear_Common: 119.83 fps
PicZoom_Bilinear_MMX: 180.12 fps
PicZoom_Bilinear_MMX_Ex: 237.34 fps
PicZoom_ftBilinear_Common: 118.67 fps
PicZoom_ftBilinear_MMX: 213.68 fps
PicZoom_ThreeOrder0: 6.11 fps
PicZoom_ThreeOrder_Common: 25.38 fps
PicZoom_ThreeOrder_MMX: 52.32 fps
(SSE2的實作)
PicZoom_ftBilinearTable_SSE2: 588.24 fps
PicZoom_ThreeOrderTable_SSE2: 93.24 fps
PicZoom_ftBilinearTable_SSE2實作代碼如下:
// 64-bit unsigned word. The weight tables in this file store one 16-bit
// fixed-point coefficient replicated into all four 16-bit lanes of it.
typedef UInt64 TMMXData64;
// ftBilinearTable_SSE2(): inline-asm fragment that bilinearly interpolates ONE
// destination pixel. It must be expanded inside an asm block.
//
// Register contract on entry (as set up by PicZoom_ftBilinearTable_SSE2):
//   esi          = current source row, ecx = next source row
//   [edx+ebx]    = Int32 source x index of this destination pixel
//   [ebp+ebx*4]  = 16 bytes of horizontal weights: four words of ur
//                  (left pixel) followed by four words of u (right pixel)
//   xmm5         = vertical weight of the current row (vr) in all 8 words
//   xmm6         = vertical weight of the next row (v) in all 8 words
//   xmm7         = 0 (used to widen bytes to words and to pack back)
//   [edi+ebx]    = destination pixel (4 bytes written)
// Clobbers eax, xmm0, xmm1.
//
// Each movq loads two horizontally adjacent 32-bit pixels; the rows are blended
// first (pmullw/paddw), then the two columns (pmulhw, then adding the high
// quadword onto the low one).
//
// The continuation character must be '\' and must be the last character on the
// line. The multiplies must use xmm5/xmm6: an MMX register (mm5/mm6) cannot be
// paired with an XMM destination.
// An out-of-line alternative would be a __declspec(naked) function ending in
// `ret` and invoked with `call`.
#define ftBilinearTable_SSE2()                          \
        asm mov         eax,[edx+ebx]                   \
        asm movq        xmm0,qword ptr[esi+eax*4]       \
        asm movq        xmm1,qword ptr[ecx+eax*4]       \
        asm punpcklbw   xmm0,xmm7                       \
        asm punpcklbw   xmm1,xmm7                       \
        asm pmullw      xmm0,xmm5                       \
        asm pmullw      xmm1,xmm6                       \
        asm paddw       xmm0,xmm1                       \
        asm pmulhw      xmm0,xmmword ptr [ebp+ebx*4]    \
        asm movdqa      xmm1,xmm0                       \
        asm punpckhqdq  xmm0,xmm0                       \
        asm paddw       xmm0,xmm1                       \
        asm packuswb    xmm0,xmm7                       \
        asm movd        dword ptr [edi+ebx],xmm0
// ftBilinearTable_SSE2_expand2(): inline-asm fragment that bilinearly
// interpolates TWO consecutive destination pixels per expansion. It must be
// expanded inside an asm block.
//
// Register contract on entry (as set up by PicZoom_ftBilinearTable_SSE2):
//   esi            = current source row, ecx = next source row
//   [edx+ebx]      = Int32 source x index of the first pixel,
//   [edx+ebx+4]    = Int32 source x index of the second pixel
//   [ebp+ebx*4]    = 16 bytes of horizontal weights (ur x4, u x4), first pixel
//   [ebp+ebx*4+16] = the same for the second pixel
//   xmm5           = vertical weight of the current row (vr) in all 8 words
//   xmm6           = vertical weight of the next row (v) in all 8 words
//   xmm7           = 0
//   [edi+ebx]      = destination (8 bytes = two pixels written)
// Clobbers eax, xmm0..xmm3.
//
// After the horizontal multiply, xmm0/xmm2 each hold (left*ur | right*u) for
// one pixel; punpcklqdq/punpckhqdq regroup them as (left0 | left1) and
// (right0 | right1) so a single paddw finishes both pixels.
//
// The continuation character must be '\' and must be the last character on the
// line. The multiplies must use xmm5/xmm6: an MMX register (mm5/mm6) cannot be
// paired with an XMM destination.
#define ftBilinearTable_SSE2_expand2()                      \
        asm mov         eax,[edx+ebx]                       \
        asm movq        xmm0,qword ptr[esi+eax*4]           \
        asm movq        xmm1,qword ptr[ecx+eax*4]           \
        asm mov         eax,[edx+ebx+4]                     \
        asm movq        xmm2,qword ptr[esi+eax*4]           \
        asm movq        xmm3,qword ptr[ecx+eax*4]           \
        asm punpcklbw   xmm0,xmm7                           \
        asm punpcklbw   xmm1,xmm7                           \
        asm punpcklbw   xmm2,xmm7                           \
        asm punpcklbw   xmm3,xmm7                           \
        asm pmullw      xmm0,xmm5                           \
        asm pmullw      xmm1,xmm6                           \
        asm pmullw      xmm2,xmm5                           \
        asm pmullw      xmm3,xmm6                           \
        asm paddw       xmm0,xmm1                           \
        asm paddw       xmm2,xmm3                           \
        asm pmulhw      xmm0,xmmword ptr [ebp+ebx*4]        \
        asm pmulhw      xmm2,xmmword ptr [ebp+ebx*4+16]     \
        asm movdqa      xmm1,xmm0                           \
        asm punpcklqdq  xmm0,xmm2                           \
        asm punpckhqdq  xmm1,xmm2                           \
        asm paddw       xmm0,xmm1                           \
        asm packuswb    xmm0,xmm7                           \
        asm movq        qword ptr [edi+ebx],xmm0
void PicZoom_ftBilinearTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(2>Src.width)||(2>Src.height)) return;
long xrIntFloat_16=((Src.width-1)<<16)/Dst.width;
long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;
long dst_width=Dst.width;
UInt8* _bufMem=new UInt8[(dst_width*2*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)];
TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4); //16byte對齊
Int32* xList=(Int32*)(uList+dst_width*2);
{//init u table
long srcx_16=0;
for (long x=0;x<dst_width*2;x+=2){
xList[x>>1]=(srcx_16>>16);
unsigned long u=(srcx_16>>8)&0xFF;
unsigned long ur=(256-u)<<1;
u=u<<1;
uList[x+0]=(ur|(ur<<16));
uList[x+0]|=uList[x+0]<<32;
uList[x+1]=u|(u<<16);
uList[x+1]|=uList[x+1]<<32;
srcx_16+=xrIntFloat_16;
}
}
Color32* pDstLine=Dst.pdata;
long srcy_16=0;
asm pxor xmm7,xmm7 //xmm7=0
for (long y=0;y<Dst.height;++y){
unsigned long v=(srcy_16>>8) & 0xFF;
unsigned long vr=(256-v)>>1;
v>>=1;
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src.byte_width*(srcy_16>>16)) ;
Color32* PSrcLineColorNext= (Color32*)((UInt8*)(PSrcLineColor)+Src.byte_width) ;
asm{
movd xmm5,vr
movd xmm6,v
punpcklwd xmm5,xmm5
punpcklwd xmm6,xmm6
punpckldq xmm5,xmm5
punpckldq xmm6,xmm6
punpcklqdq xmm5,xmm5
punpcklqdq xmm6,xmm6
mov esi,PSrcLineColor
mov ecx,PSrcLineColorNext
mov edx,xList //x
mov ebx,dst_width
mov edi,pDstLine
push ebp
mov ebp,uList
push ebx
and ebx,(not 1)
test ebx,ebx
jle end_loop2
lea ebx,[ebx*4]
lea edi,[edi+ebx]
lea edx,[edx+ebx]
lea ebp,[ebp+ebx*4]
neg ebx
loop2_start:
//call ftBilinearTable_SSE2_expand2
ftBilinearTable_SSE2_expand2()
add ebx,8
jnz loop2_start
end_loop2:
pop ebx
and ebx,1
test ebx,ebx
jle end_write
lea ebx,[ebx*4]
lea edi,[edi+ebx]
lea edx,[edx+ebx]
lea ebp,[ebp+ebx*4]
neg ebx
loop1_start:
//call ftBilinearTable_SSE2
ftBilinearTable_SSE2()
add ebx,4
jnz loop1_start
end_write:
pop ebp
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
delete []_bufMem;
}
PicZoom_ThreeOrderTable_SSE2實作代碼如下:
// Kernel lookup table with (2<<8)+1 = 513 entries, indexed by i for x = i/256
// in [0,2]. Each entry holds the 16-bit fixed-point value (scale 1<<14) of
// SinXDivX(x) replicated into all four 16-bit lanes. It is filled by
// _CAutoInti_SinXDivX_Table64_MMX.
static TMMXData64 SinXDivX_Table64_MMX[(2<<8)+1];
// Helper whose constructor fills SinXDivX_Table64_MMX. A file-scope instance
// makes the table ready before it is used.
class _CAutoInti_SinXDivX_Table64_MMX {
private:
    // Fills every table entry with SinXDivX(i/256) in 2.14 fixed point,
    // replicated into the four 16-bit lanes of a 64-bit word.
    void _Inti_SinXDivX_Table64_MMX()
    {
        for (long i=0;i<=(2<<8);++i)
        {
            // Convert through a signed integer first. SinXDivX is defined
            // elsewhere; if it returns negative values (a cubic-convolution
            // kernel normally does for 1<x<2 -- confirm), casting a negative
            // double straight to unsigned short is undefined behaviour. The
            // signed-to-unsigned narrowing below is well defined and keeps the
            // two's-complement bit pattern that the signed pmulhw multiply
            // consumes. The truncation toward zero is unchanged.
            const long fixed=(long)(0.5+(1<<14)*SinXDivX(i*(1.0/(256))));
            const unsigned short t=(unsigned short)fixed;
            unsigned long tl=t|(((unsigned long)t)<<16);
            TMMXData64 tll=tl|(((TMMXData64)tl)<<32);
            SinXDivX_Table64_MMX[i]=tll;
        }
    }
public:
    _CAutoInti_SinXDivX_Table64_MMX() { _Inti_SinXDivX_Table64_MMX(); }
};
// File-scope instance: its constructor runs during static initialization and
// fills SinXDivX_Table64_MMX.
// NOTE(review): identifiers containing a double underscore are reserved for the
// implementation in C++; consider renaming.
static _CAutoInti_SinXDivX_Table64_MMX __tmp_CAutoInti_SinXDivX_Table64_MMX;
// _private_ThreeOrderTable_Fast_SSE2_2(): inline-asm fragment that accumulates
// TWO source rows of the 4x4 cubic-convolution window. It is expanded twice by
// ThreeOrderTable_Fast_SSE2.
//
// Register contract on entry:
//   eax        = first of the two rows; 4 pixels (16 bytes) are read per row
//   edx        = row stride in bytes (the second row is at eax+edx)
//   ecx        = 32 bytes of horizontal weights: taps 0,1 at [ecx],
//                taps 2,3 at [ecx+16]; each tap is 4 words
//   ebx        = 32 bytes of vertical weights: first row at [ebx],
//                second row at [ebx+16]
//   xmm7       = 0
// Result: xmm0 = eight words; the low four hold the tap 0 + tap 2 partial sum
// and the high four the tap 1 + tap 3 partial sum, added over both rows.
// Clobbers xmm0..xmm3.
//
// Pixels are widened to words and shifted left by 7 before the two pmulhw
// steps (each keeps the high 16 bits of a product with a 2.14 weight).
//
// The continuation character must be '\' and must be the last character on the
// line; the final line carries none.
#define _private_ThreeOrderTable_Fast_SSE2_2()              \
        asm movq        xmm0,qword ptr [eax]                \
        asm movq        xmm1,qword ptr [eax+8]              \
        asm movq        xmm2,qword ptr [eax+edx]            \
        asm movq        xmm3,qword ptr [eax+edx+8]          \
        asm punpcklbw   xmm0,xmm7                           \
        asm punpcklbw   xmm1,xmm7                           \
        asm punpcklbw   xmm2,xmm7                           \
        asm punpcklbw   xmm3,xmm7                           \
        asm psllw       xmm0,7                              \
        asm psllw       xmm1,7                              \
        asm psllw       xmm2,7                              \
        asm psllw       xmm3,7                              \
        asm pmulhw      xmm0,xmmword ptr [ecx]              \
        asm pmulhw      xmm1,xmmword ptr [ecx+16]           \
        asm pmulhw      xmm2,xmmword ptr [ecx]              \
        asm pmulhw      xmm3,xmmword ptr [ecx+16]           \
        asm paddsw      xmm0,xmm1                           \
        asm paddsw      xmm2,xmm3                           \
        asm pmulhw      xmm0,xmmword ptr [ebx]              \
        asm pmulhw      xmm2,xmmword ptr [ebx+16]           \
        asm paddsw      xmm0,xmm2
// Computes one destination pixel by cubic convolution over a 4x4 window.
//
// pixel      : top-left pixel of the window; rows are byte_width bytes apart
//              and 4 pixels are read from each of the 4 rows.
// byte_width : row stride in bytes.
// v4         : vertical weights, 8 x TMMXData64 (each row's weight stored
//              twice, i.e. 16 bytes per row).
// u4         : horizontal weights, 4 x TMMXData64 (one per column tap).
// v4 and u4 are used as 16-byte pmulhw memory operands, so they must be
// 16-byte aligned.
// Returns the packed 32-bit pixel. The value is left in eax by the final movd
// and there is no C++ return statement.
// NOTE(review): this relies on the 32-bit x86 inline-asm convention of
// returning the value in eax -- confirm for the target compiler.
must_inline UInt32 ThreeOrderTable_Fast_SSE2(const Color32* pixel,long byte_width,const TMMXData64* v4,const TMMXData64* u4){
    // Zero xmm7 locally: the code below needs it to be 0, and register values
    // are not guaranteed to be preserved between separate asm blocks.
    asm pxor    xmm7,xmm7
    asm mov     eax,pixel
    asm mov     edx,byte_width
    asm mov     ebx,v4
    asm mov     ecx,u4
    //asm call _private_ThreeOrderTable_Fast_SSE2_2
    _private_ThreeOrderTable_Fast_SSE2_2();     // rows 0 and 1
    asm movdqa  xmm6,xmm0
    asm lea     eax,[eax+edx*2]  // advance two rows (2*byte_width)
    asm lea     ebx,[ebx+32]     // vertical weights of rows 2 and 3
    //asm call _private_ThreeOrderTable_Fast_SSE2_2
    _private_ThreeOrderTable_Fast_SSE2_2();     // rows 2 and 3
    asm paddsw  xmm6,xmm0
    asm movdqa  xmm5,xmm6
    asm psrldq  xmm6,8   // shift right by 8 bytes: bring the high four words down
    asm paddsw  xmm5,xmm6        // add the tap 1+3 sums to the tap 0+2 sums
    asm psraw   xmm5,3           // drop the remaining 3 fraction bits
    asm packuswb xmm5,xmm7       // saturate words to bytes
    asm movd    eax,xmm5
}
// Clamps a coordinate: results at or below 0 become 0, results at or above
// maxx become maxx, anything in between is returned unchanged.
must_inline long getSizeBorder(long x,long maxx){
    if (x>0)
        return (x<maxx)?x:maxx;
    return 0;
}
// Border-safe variant of the cubic-convolution sample: gathers the 4x4 window
// whose top-left corner is (x0_sub1, y0_sub1) into a local buffer, clamping
// every coordinate into the picture with getSizeBorder, then lets
// ThreeOrderTable_Fast_SSE2 do the arithmetic on that buffer.
must_inline UInt32 ThreeOrderTable_Border_SSE2(const TPixels32Ref& pic,const long x0_sub1,const long y0_sub1,const TMMXData64* v4,const TMMXData64* u4){
    const long lastX=pic.width-1;
    const long lastY=pic.height-1;

    // The four clamped column indices are the same for every row of the window.
    long col[4];
    for (long k=0;k<4;++k)
        col[k]=getSizeBorder(x0_sub1+k,lastX);

    Color32 window[16];     // 4 rows x 4 pixels, stored row after row
    for (long row=0;row<4;++row){
        Color32* srcLine=pic.getLinePixels(getSizeBorder(y0_sub1+row,lastY));
        Color32* out=&window[row*4];
        for (long k=0;k<4;++k)
            out[k]=srcLine[col[k]];
    }
    // The local window is tightly packed, so its row stride is 4 pixels.
    return ThreeOrderTable_Fast_SSE2(window,4*sizeof(Color32),v4,u4);
}
void PicZoom_ThreeOrderTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if ( (0==Dst.width)||(0==Dst.height)
||(0==Src.width)||(0==Src.height)) return;
long dst_width=Dst.width;
long dst_height=Dst.height;
long xrIntFloat_16=((Src.width)<<16)/dst_width+1;
long yrIntFloat_16=((Src.height)<<16)/dst_height+1;
const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);
//計算出需要特殊處理的邊界
long border_y0=((1<<16)-csDErrorY)/yrIntFloat_16+1;//y0+y*yr>=1; y0=csDErrorY => y>=(1-csDErrorY)/yr
if (border_y0>=dst_height) border_y0=dst_height;
long border_x0=((1<<16)-csDErrorX)/xrIntFloat_16+1;
if (border_x0>=dst_width ) border_x0=dst_width;
long border_y1=(((Src.height-3)<<16)-csDErrorY)/yrIntFloat_16+1; //y0+y*yr<=(height-3) => y<=(height-3-csDErrorY)/yr
if (border_y1<border_y0) border_y1=border_y0;
long border_x1=(((Src.width-3)<<16)-csDErrorX)/xrIntFloat_16+1;;
if (border_x1<border_x0) border_x1=border_x0;
UInt8* _bufMem=new UInt8[(dst_width*4*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)];
TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4); //16byte對齊
Int32* xList=(Int32*)(uList+dst_width*4);
{//init u table
long srcx_16=csDErrorX;
for (long x=0;x<dst_width*4;x+=4){
xList[x>>2]=(srcx_16>>16)-1;
long u=(srcx_16>>8)&0xFF;
uList[x+0]=SinXDivX_Table64_MMX[256+u];
uList[x+1]=SinXDivX_Table64_MMX[u];
uList[x+2]=SinXDivX_Table64_MMX[256-u];
uList[x+3]=SinXDivX_Table64_MMX[512-u];
srcx_16+=xrIntFloat_16;
}
}
TMMXData64 _v4[8+2];
TMMXData64* v4=(&_v4[0]); v4=(TMMXData64*)( (((ptrdiff_t)v4)+15)>>4<<4);
asm pxor xmm7,xmm7
Color32* pDstLine=Dst.pdata;
long srcy_16=csDErrorY;
for (long y=0;y<dst_height;++y){
//v table
const long srcy_sub1=(srcy_16>>16)-1;
const long v=(srcy_16>>8)&0xFF;
v4[0]=SinXDivX_Table64_MMX[256+v];
v4[1]=v4[0];
v4[2]=SinXDivX_Table64_MMX[v];
v4[3]=v4[2];
v4[4]=SinXDivX_Table64_MMX[256-v];
v4[5]=v4[4];
v4[6]=SinXDivX_Table64_MMX[512-v];
v4[7]=v4[6];
if ((y<border_y0)||(y>=border_y1)){
for (long x=0;x<dst_width;++x)
pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]); //border
}else{
for (long x=0;x<border_x0;++x)
pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]);//border
const Color32* pixelLine=Src.getLinePixels(srcy_sub1);
long byte_width=Src.byte_width;
for (long x=border_x0;x<border_x1;++x)
pDstLine[x].argb=ThreeOrderTable_Fast_SSE2(&pixelLine[xList[x]],byte_width,v4,&uList[x*4]);//fast MMX !
for (long x=border_x1;x<dst_width;++x)
pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]);//border
}
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
delete []_bufMem;
}