天天看點

AVX SSE 性能實驗

AVX SSE 性能實驗

代碼

···

#include “math_function.h”

// Scalar reference dot product.
//
// Walks both arrays front to back, accumulating input1[i] * input2[i] into a
// single float, and returns the total. When size <= 0 nothing is read and the
// result is 0. The pointers are not checked for null.
float MathMulAdd(const float *input1, const float *input2, int size)
{
    float acc = 0.0;
    const float *lhs = input1;
    const float *rhs = input2;

    for (int remaining = size; remaining > 0; --remaining)
    {
        const float product = (*lhs) * (*rhs);
        acc += product;
        ++lhs;
        ++rhs;
    }

    return acc;
}

// Dot product of two float arrays using 128-bit SSE vectors (4 floats each).
//
// Parameters:
//   input1, input2 - arrays holding at least `size` floats each. No particular
//                    alignment is required: unaligned loads are used, so any
//                    valid float pointer is accepted.
//   size           - number of elements to process; size <= 0 yields 0.
// Returns:
//   The sum of input1[i] * input2[i]. If either pointer is null a message is
//   printed and -1 is returned (note that -1 can also be a legitimate result).
float SSEMulAdd(const float *input1, const float *input2, int size)
{
    if (input1 == nullptr || input2 == nullptr)
    {
        printf("input data is null\n");
        return -1;
    }

    const int nBlockWidth = 4;               // floats per __m128
    const int cntBlock = size / nBlockWidth; // number of full vectors
    const int cntRem = size % nBlockWidth;   // scalar tail length

    __m128 sumData = _mm_setzero_ps();
    const float *p1 = input1;
    const float *p2 = input2;

    for (int i = 0; i < cntBlock; i++)
    {
        // _mm_loadu_ps instead of _mm_load_ps: the aligned form faults when the
        // caller passes a pointer that is not 16-byte aligned.
        const __m128 loadData1 = _mm_loadu_ps(p1);
        const __m128 loadData2 = _mm_loadu_ps(p2);
        sumData = _mm_add_ps(sumData, _mm_mul_ps(loadData1, loadData2));
        p1 += nBlockWidth;
        p2 += nBlockWidth;
    }

    // Horizontal reduction of the four partial sums s0..s3, evaluated as
    // (s0 + s1) + (s2 + s3). Only SSE1 shuffles are used, so no SSE3
    // (_mm_hadd_ps) support is needed.
    const __m128 swapped = _mm_shuffle_ps(sumData, sumData, _MM_SHUFFLE(2, 3, 0, 1)); // [s1, s0, s3, s2]
    const __m128 pairSums = _mm_add_ps(sumData, swapped);                             // [s0+s1, .., s2+s3, ..]
    const __m128 upperPair = _mm_movehl_ps(pairSums, pairSums);                       // lane 0 = s2+s3
    float output = _mm_cvtss_f32(_mm_add_ss(pairSums, upperPair));

    // Elements that did not fill a whole vector are handled one at a time.
    for (int i = 0; i < cntRem; i++)
    {
        output += p1[i] * p2[i];
    }

    return output;
}

// Dot product of two float arrays using 128-bit vectors and, when available,
// fused multiply-add.
//
// If the translation unit is built with FMA enabled, each step is a single
// _mm_fmadd_ps. Otherwise the function falls back to a separate multiply and
// add so that it still compiles; the result is mathematically the same but may
// differ in the last bits because FMA rounds once instead of twice.
//
// Parameters:
//   input1, input2 - arrays holding at least `size` floats each. No particular
//                    alignment is required (unaligned loads are used).
//   size           - number of elements to process; size <= 0 yields 0.
// Returns:
//   The sum of input1[i] * input2[i]. If either pointer is null a message is
//   printed and -1 is returned (note that -1 can also be a legitimate result).
float SSEFmAdd(const float *input1, const float *input2, int size)
{
    if (input1 == nullptr || input2 == nullptr)
    {
        printf("input data is null\n");
        return -1;
    }

    const int nBlockWidth = 4;               // floats per __m128
    const int cntBlock = size / nBlockWidth; // number of full vectors
    const int cntRem = size % nBlockWidth;   // scalar tail length

    __m128 sumData = _mm_setzero_ps();
    const float *p1 = input1;
    const float *p2 = input2;

    for (int i = 0; i < cntBlock; i++)
    {
        // _mm_loadu_ps instead of _mm_load_ps: the aligned form faults when the
        // caller passes a pointer that is not 16-byte aligned.
        const __m128 loadData1 = _mm_loadu_ps(p1);
        const __m128 loadData2 = _mm_loadu_ps(p2);
#if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__))
        sumData = _mm_fmadd_ps(loadData1, loadData2, sumData);
#else
        sumData = _mm_add_ps(sumData, _mm_mul_ps(loadData1, loadData2));
#endif
        p1 += nBlockWidth;
        p2 += nBlockWidth;
    }

    // Horizontal reduction of the four partial sums s0..s3, evaluated as
    // (s0 + s1) + (s2 + s3) with SSE1 shuffles only.
    const __m128 swapped = _mm_shuffle_ps(sumData, sumData, _MM_SHUFFLE(2, 3, 0, 1)); // [s1, s0, s3, s2]
    const __m128 pairSums = _mm_add_ps(sumData, swapped);                             // [s0+s1, .., s2+s3, ..]
    const __m128 upperPair = _mm_movehl_ps(pairSums, pairSums);                       // lane 0 = s2+s3
    float output = _mm_cvtss_f32(_mm_add_ss(pairSums, upperPair));

    // Elements that did not fill a whole vector are handled one at a time.
    for (int i = 0; i < cntRem; i++)
    {
        output += p1[i] * p2[i];
    }

    return output;
}

// Dot product of two float arrays using 256-bit AVX vectors (8 floats each).
//
// On GCC/Clang, when the translation unit is not compiled with AVX enabled,
// the function is given a target("avx") attribute so the intrinsics still
// compile. Either way the caller must only invoke it on a CPU that supports
// AVX.
//
// Parameters:
//   input1, input2 - arrays holding at least `size` floats each. No particular
//                    alignment is required (unaligned loads are used).
//   size           - number of elements to process; size <= 0 yields 0.
// Returns:
//   The sum of input1[i] * input2[i]. If either pointer is null a message is
//   printed and -1 is returned (note that -1 can also be a legitimate result).
#if defined(__GNUC__) && !defined(__AVX__)
__attribute__((target("avx")))
#endif
float AVXMulAdd(const float *input1, const float *input2, int size)
{
    if (input1 == nullptr || input2 == nullptr)
    {
        printf("input data is null\n");
        return -1;
    }

    const int nBlockWidth = 8;               // floats per __m256
    const int cntBlock = size / nBlockWidth; // number of full vectors
    const int cntRem = size % nBlockWidth;   // scalar tail length

    __m256 sumData = _mm256_setzero_ps();
    const float *p1 = input1;
    const float *p2 = input2;

    for (int i = 0; i < cntBlock; i++)
    {
        const __m256 loadData1 = _mm256_loadu_ps(p1);
        const __m256 loadData2 = _mm256_loadu_ps(p2);
        sumData = _mm256_add_ps(sumData, _mm256_mul_ps(loadData1, loadData2));
        p1 += nBlockWidth;
        p2 += nBlockWidth;
    }

    // _mm256_hadd_ps works independently inside each 128-bit half. After two
    // passes every lane of the low half holds (s0+s1)+(s2+s3) and every lane of
    // the high half holds (s4+s5)+(s6+s7).
    sumData = _mm256_hadd_ps(sumData, sumData);
    sumData = _mm256_hadd_ps(sumData, sumData);

    // Extract one lane from each half with intrinsics rather than reading the
    // vector through a float pointer.
    float output = 0;
    output += _mm_cvtss_f32(_mm256_castps256_ps128(sumData));   // low half total
    output += _mm_cvtss_f32(_mm256_extractf128_ps(sumData, 1)); // high half total

    // Elements that did not fill a whole vector are handled one at a time.
    for (int i = 0; i < cntRem; i++)
    {
        output += p1[i] * p2[i];
    }

    return output;
}

// Dot product of two float arrays using 256-bit AVX vectors and, when
// available, fused multiply-add.
//
// If the translation unit is built with FMA enabled, each step is a single
// _mm256_fmadd_ps. Otherwise the function falls back to a separate multiply
// and add so that it still compiles; the result is mathematically the same but
// may differ in the last bits because FMA rounds once instead of twice.
//
// On GCC/Clang, when the translation unit is not compiled with AVX enabled,
// the function is given a target("avx") attribute so the intrinsics still
// compile. Either way the caller must only invoke it on a CPU that supports
// the instructions actually used.
//
// Parameters:
//   input1, input2 - arrays holding at least `size` floats each. No particular
//                    alignment is required (unaligned loads are used).
//   size           - number of elements to process; size <= 0 yields 0.
// Returns:
//   The sum of input1[i] * input2[i]. If either pointer is null a message is
//   printed and -1 is returned (note that -1 can also be a legitimate result).
#if defined(__GNUC__) && !defined(__AVX__)
__attribute__((target("avx")))
#endif
float AVXFmAdd(const float *input1, const float *input2, int size)
{
    if (input1 == nullptr || input2 == nullptr)
    {
        printf("input data is null\n");
        return -1;
    }

    const int nBlockWidth = 8;               // floats per __m256
    const int cntBlock = size / nBlockWidth; // number of full vectors
    const int cntRem = size % nBlockWidth;   // scalar tail length

    __m256 sumData = _mm256_setzero_ps();
    const float *p1 = input1;
    const float *p2 = input2;

    for (int i = 0; i < cntBlock; i++)
    {
        const __m256 loadData1 = _mm256_loadu_ps(p1);
        const __m256 loadData2 = _mm256_loadu_ps(p2);
#if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__))
        sumData = _mm256_fmadd_ps(loadData1, loadData2, sumData);
#else
        sumData = _mm256_add_ps(sumData, _mm256_mul_ps(loadData1, loadData2));
#endif
        p1 += nBlockWidth;
        p2 += nBlockWidth;
    }

    // _mm256_hadd_ps works independently inside each 128-bit half. After two
    // passes every lane of the low half holds (s0+s1)+(s2+s3) and every lane of
    // the high half holds (s4+s5)+(s6+s7).
    sumData = _mm256_hadd_ps(sumData, sumData);
    sumData = _mm256_hadd_ps(sumData, sumData);

    // Extract one lane from each half with intrinsics rather than reading the
    // vector through a float pointer.
    float output = 0;
    output += _mm_cvtss_f32(_mm256_castps256_ps128(sumData));   // low half total
    output += _mm_cvtss_f32(_mm256_extractf128_ps(sumData, 1)); // high half total

    // Elements that did not fill a whole vector are handled one at a time.
    for (int i = 0; i < cntRem; i++)
    {
        output += p1[i] * p2[i];
    }

    return output;
}

···

main.cpp測試代碼

#include "math_function.h"
#include <time.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <random>
 
using std::default_random_engine;
using std::uniform_real_distribution;
 
// Benchmark driver.
//
// Allocates two float buffers, advances each pointer to the next 32-byte
// boundary, fills them with uniform random values in [0, 1), then runs every
// dot-product implementation cntLoop times and prints its result together with
// the CPU time consumed, converted from clock() ticks to milliseconds.
// Returns 0 on success and 1 if a buffer could not be allocated.
int main(int argc, char* argv[])
{
	(void)argc;
	(void)argv;

	const int size = 325400;
	const int cntLoop = 1;
	const std::size_t align_bytes = 32; // width of one 256-bit vector

	// Over-allocate by align_bytes so the start can be rounded up to a boundary.
	float *x1 = (float *)std::malloc(sizeof(float) * size + align_bytes);
	float *x2 = (float *)std::malloc(sizeof(float) * size + align_bytes);
	if (x1 == nullptr || x2 == nullptr)
	{
		std::printf("malloc failed\n");
		std::free(x1);
		std::free(x2);
		return 1;
	}

	// Round an address up to the next multiple of align_bytes. uintptr_t is used
	// because unsigned long is narrower than a pointer on some 64-bit targets.
	auto align_up = [align_bytes](float *p) -> float *
	{
		const std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p);
		const std::uintptr_t aligned = (addr + align_bytes - 1) / align_bytes * align_bytes;
		return reinterpret_cast<float *>(aligned);
	};
	float *input1 = align_up(x1);
	float *input2 = align_up(x2);

	std::default_random_engine e;
	std::uniform_real_distribution<float> u(0, 1); // uniform distribution over [0, 1)
	for (int i = 0; i < size; i++)
	{
		input1[i] = u(e);
		input2[i] = u(e);
	}

	// Runs one implementation cntLoop times and reports the last result and the
	// elapsed CPU time. clock() counts ticks, CLOCKS_PER_SEC of them per second,
	// so the difference is scaled to milliseconds before printing.
	auto run = [&](const char *label, float (*fn)(const float *, const float *, int))
	{
		const clock_t start_t = clock();
		float result = 0.0f;
		for (int i = 0; i < cntLoop; i++)
			result = fn(input1, input2, size);
		const clock_t end_t = clock();

		const double elapsed_ms = 1000.0 * (double)(end_t - start_t) / CLOCKS_PER_SEC;
		std::printf("%s%f\t", label, result);
		std::printf("cost time: %.3f(ms)\n", elapsed_ms);
	};

	run("org = ", MathMulAdd);
	run("sse = ", SSEMulAdd);
	run("sse_= ", SSEFmAdd);
	run("avx = ", AVXMulAdd);
	run("avx_= ", AVXFmAdd);

	// Free the pointers returned by malloc, not the aligned ones.
	std::free(x1);
	std::free(x2);

	return 0;
}
           

編譯

g++ -O3 -mavx -march=native -m64 -Wall -std=c++11 main.cpp math_function.cpp -o cpu_opt

測試結果

  • avx load對齊
    AVX SSE 性能實驗AVX SSE 性能實驗
  • avx load不對齊
    AVX SSE 性能實驗AVX SSE 性能實驗

結論

  • 128 sse 速度差不多達到了普通算法的4倍 (理論值是4倍)
  • 256 avx 速度達到了普通算法的7倍多(理論值是8倍)
  • 對于avx而言對不對齊速度上差距不大

繼續閱讀