天天看点

操作系统提供了将进程或者线程绑定到某一颗 CPU 上运行的能力,可以提升多核 CPU 下的缓存命中率。

参考 陶辉极客时间 https://time.geekbang.org/column/article/230194

代码示例

#include "stdio.h"
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <iostream>
#include <unistd.h>
#include <pthread.h>
#include <sched.h>


using namespace std;

// Size in bytes of each thread's working array; loopcalc also uses it as the
// bound of its outer pass loop. The expansion is parenthesized so the macro
// behaves as a single value inside any surrounding expression (e.g. x % TESTN).
#define TESTN (16*1024L)

// When true, each worker thread binds itself to a CPU (enabled by -f, disabled by -s).
bool setaffinity = false;

/**
 * Thread entry point: optionally pins the calling thread to one CPU, then
 * times a loop that repeatedly walks a small byte array.
 *
 * @param args  Pointer to a long. On entry it holds this thread's index
 *              (used as the CPU number when setaffinity is true); on exit it
 *              is overwritten with the elapsed wall-clock time in milliseconds.
 * @return      Always NULL; the measurement is reported through args.
 */
void* loopcalc(void* args) {
	// The slot is a long (it is written as long below), so it must also be
	// read as long rather than through an int pointer.
	long* slot = (long*)args;

	if (setaffinity) {
		cpu_set_t mask;  // set of CPUs this thread is allowed to run on

		// Bind the calling thread (pid 0 means "the caller") to the CPU whose
		// number equals the thread index.
		CPU_ZERO(&mask);
		CPU_SET((int)*slot, &mask);
		if (sched_setaffinity(0, sizeof(mask), &mask) == -1)
		{
			cout<<"warning: could not set CPU affinity, continuing...\n";
		}
	}

	timeval tStart, tEnd;
	// clock() is deliberately not used: it reports CPU time consumed by the
	// process across all CPUs, which is not suitable for a per-thread figure.
	gettimeofday(&tStart, 0);

	// The loop keeps revisiting a small array, so the CPU cache hit rate is
	// very high.
	// NOTE(review): the array contents are never read afterwards, so an
	// optimizing build may shorten or drop this work -- confirm build flags.
	unsigned char* arr = new unsigned char[TESTN];
	for (long i = 0; i < TESTN; i++) arr[i] = rand() % 256;
	for (int j = 1; j < TESTN; j++) {
		for (long i = 0; i < TESTN; i++) arr[i] += 1;
	}

	gettimeofday(&tEnd, 0);

	// Release the working array outside the timed region.
	delete[] arr;

	// Hand the elapsed time (microseconds converted to milliseconds) back to
	// the caller through the same slot that carried the thread index.
	*slot = (1000000LL * (tEnd.tv_sec-tStart.tv_sec) + (tEnd.tv_usec-tStart.tv_usec))/1000;

	// A function declared to return void* must return a value.
	return NULL;
}

int main(int argc, char** argv) {
	int threadnum = 2;
	int ch;
	while((ch = getopt(argc, argv, "t:fs")) != -1) {
		switch(ch)
		{
			//设置测试的并发线程数,注意不要超过机器上的CPU核数
		   case 't':
			  threadnum = atoi(optarg);
			  break;
		   //将线程绑定至特定CPU上
		   case 'f':
		      setaffinity = true;
			  break;
		   //不绑定CPU
		   case 's':
		   	  setaffinity = false;
			  break;
		}
	}

	pthread_t* id = new pthread_t[threadnum];
	//统计每个线程计算所需要的时间
	long* timecost = new long[threadnum];
	for(int i = 0; i < threadnum; i++) {
		//最初timecost用于传递线程号,用于绑定CPU
		timecost[i] = i;
		int ret=pthread_create(&id[i],NULL,loopcalc,&timecost[i]); 
		
		if(ret!=0){ 
			cout<<"Create pthread error!\n"; 
			exit (1); 
		} 
	}

	long costsum = 0;
	//等待所有线程结束 
	for(int i = 0; i < threadnum; i++) {
		pthread_join(id[i],NULL);
		costsum += timecost[i];
	}
	//比较平均每线程所用时间
	cout<<"costsum: "<<costsum<<", avg: "<<costsum/threadnum<<endl;
}
           

1. 验证环境

操作系统: CentOS7.0

CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz

GCC-C++: 4.8.5

JAVA: 1.8.0

Python: 2.7.5

2. C++程序cpu_migrate.cpp

a. 编译程序

安装编译依赖的软件

如Linux中需要安装C++编译器:CentOS中可用yum install gcc-c++安装,Ubuntu中可用apt-get install g++安装

编译程序

g++ cpu_migrate.cpp -o cpu_migrate -lpthread

注意,多线程依赖pthread库,编译时需要链接

b. 运行验证

使用14个(共28个CPU核心)并发线程测试,不绑定CPU

./cpu_migrate -t 14 -s

平均每线程消耗时间(毫秒):1083

使用14个(共28个CPU核心)并发线程测试,绑定CPU

./cpu_migrate -t 14 -f

平均每线程消耗时间(毫秒):926

c. 使用perf验证缓存命中率

使用14个(共28个CPU核心)并发线程测试,不绑定CPU

perf stat -e cpu-migrations,cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses,branch-load-misses,branch-loads ./cpu_migrate -t 14 -s

输出结果:

 Performance counter stats for './cpu_migrate -t 14 -s':

                10      cpu-migrations

         8,193,825      cache-references                                              (44.40%)

           175,792      cache-misses              #    2.145 % of all cache refs      (44.34%)

    45,480,238,906      instructions              #    1.30  insn per cycle           (55.47%)

    35,111,144,560      cycles                                                        (55.47%)

        11,997,428      L1-dcache-load-misses     #    0.05% of all L1-dcache hits    (55.57%)

    26,407,960,253      L1-dcache-loads                                               (55.60%)

         2,459,766      L1-icache-load-misses                                         (55.66%)

         2,136,304      branch-load-misses                                            (44.53%)

     3,825,848,726      branch-loads                                                  (44.43%)

       1.251076337 seconds time elapsed

      14.630618000 seconds user

       0.459616000 seconds sys

使用14个(共28个CPU核心)并发线程测试,绑定CPU

perf stat -e cpu-migrations,cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses,branch-load-misses,branch-loads ./cpu_migrate -t 14 -f

输出结果:

 Performance counter stats for './cpu_migrate -t 14 -f':

                14      cpu-migrations

         4,983,541      cache-references                                              (44.42%)

         1,611,627      cache-misses              #   32.339 % of all cache refs      (44.34%)

    45,523,818,723      instructions              #    1.52  insn per cycle           (55.43%)

    29,972,627,158      cycles                                                        (55.46%)

         5,812,831      L1-dcache-load-misses     #    0.02% of all L1-dcache hits    (55.53%)

    26,388,005,477      L1-dcache-loads                                               (55.58%)

         1,262,533      L1-icache-load-misses                                         (55.66%)

         1,363,376      branch-load-misses                                            (44.54%)

     3,828,570,015      branch-loads                                                  (44.47%)

       0.948650967 seconds time elapsed

      12.489932000 seconds user

       0.456253000 seconds sys

继续阅读