linux系統調用sched_setaffinity核心實作分析

在講解具體實作之前，我們先來看一個場景：

啟動一個程序，并且在啟動的時候就綁定該程序運作的CPU，如下圖（我們是在KVM中啟動一個虛拟機，該虛拟機啟動時就進行了CPU綁定）：

可以看到虛擬機程序的vCPU確實被綁定在了0-11,24-35號CPU上。下面我們使用taskset修改該程序綁定的CPU，比如修改為12-23，結果如下：

linux系統調用sched_setaffinity核心實作分析

可以看到上面設定失敗，于是我們程式設計再次嘗試一下，程式如下：

#include<stdlib.h>
#include<stdio.h>
#include<sys/types.h>
#include<sys/sysinfo.h>
#include<unistd.h>
#define __USE_GNU
#include<sched.h>
#include<ctype.h>
#include<string.h>

/*
 * Print the CPUs set in the affinity mask of process `pid`, testing CPU ids
 * 0 .. cpu_nums-1.
 *
 * Returns 0 on success. If sched_getaffinity() fails, the reason is reported
 * with perror() and -1 is returned without printing a (meaningless) CPU list.
 */
static int print_affinity(pid_t pid, int cpu_nums)
{
    cpu_set_t mask;
    int i;

    CPU_ZERO(&mask);
    if (sched_getaffinity(pid, sizeof(mask), &mask) != 0) {
        perror("sched_getaffinity");
        return -1;
    }

    printf("程序 %d 使用了下面的CPU：\n", (int)pid);
    for (i = 0; i < cpu_nums; i++) {
        if (CPU_ISSET(i, &mask)) {
            printf("%d ", i);
        }
    }
    printf("\n");
    return 0;
}

/*
 * Demo: show the CPU affinity of the process whose PID is given in argv[1],
 * try to change it to CPUs 12-23 and 36-47, then show the affinity again.
 *
 * Exits with a failure status when the PID argument is missing or is not a
 * non-negative decimal number; otherwise returns 0, whether or not the kernel
 * accepted the new affinity (the outcome is printed).
 *
 * NOTE(review): the CPU_* macros and sched_{get,set}affinity() are made
 * visible by the file-level `#define __USE_GNU` placed before <sched.h>;
 * defining _GNU_SOURCE before the first #include is the supported way.
 */
int main(int argc, char const *argv[]){
    cpu_set_t mask;     /* set of CPUs to request */
    char *end = NULL;
    long parsed;
    pid_t pid;
    int i, res, cpu_nums;

    if (argc < 2) {
        printf("參數錯誤\n");
        exit(EXIT_FAILURE);
    }

    /*
     * Parse strictly: atoi() would map any garbage to 0, and PID 0 means
     * "the calling process" to the affinity syscalls.
     */
    parsed = strtol(argv[1], &end, 10);
    pid = (pid_t)parsed;
    if (end == argv[1] || *end != '\0' || parsed < 0 || (long)pid != parsed) {
        printf("參數錯誤\n");
        exit(EXIT_FAILURE);
    }

    cpu_nums = (int)sysconf(_SC_NPROCESSORS_CONF);  /* number of configured CPUs */
    printf("pid=%d\tcpu_nums=%d\n", (int)pid, cpu_nums);

    /* Affinity before the change; a failure here is reported by the helper. */
    (void)print_affinity(pid, cpu_nums);

    /* Next, try to set the affinity to CPUs 12-23 and 36-47. */
    CPU_ZERO(&mask);
    for (i = 12; i <= 23; i++) {
        CPU_SET(i, &mask);
    }
    for (i = 36; i <= 47; i++) {
        CPU_SET(i, &mask);
    }

    res = sched_setaffinity(pid, sizeof(mask), &mask);
    if (res != 0) {
        /* Report errno first: an intervening printf() may overwrite it. */
        perror("error");
        printf("設定程序親和性失敗\n");
    } else {
        printf("設定程序親和性成功!\n");
    }

    /* Affinity after the attempt. */
    (void)print_affinity(pid, cpu_nums);
    return 0;
}

程式運作的結果如下：

linux系統調用sched_setaffinity核心實作分析

其實和taskset的結果一樣。然後我們嘗試将程序的CPU綁定為0-11，運作結果如下：

linux系統調用sched_setaffinity核心實作分析

結果也是不允許。那為什麼會出現這樣的結果呢？下面我們來看一下sched_setaffinity()函數在linux核心中的實作。

注：核心代碼版本：3.14.69

核心層sched_setaffinity實作如下：

/**
 * sys_sched_setaffinity - set the cpu affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new cpu mask
 *
 * Allocates a kernel-side cpumask, has get_user_cpu_mask() fill it from the
 * user buffer and, only if that reports success, passes it on to
 * sched_setaffinity(). The temporary mask is freed on every path after a
 * successful allocation.
 *
 * Return: 0 on success. An error code otherwise (-ENOMEM if the temporary
 * mask cannot be allocated).
 */
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
                unsigned long __user *, user_mask_ptr)
{
    cpumask_var_t kmask;
    int err;

    if (!alloc_cpumask_var(&kmask, GFP_KERNEL))
        return -ENOMEM;

    /* Bring the caller's requested mask into kernel memory. */
    err = get_user_cpu_mask(user_mask_ptr, len, kmask);
    if (!err)
        err = sched_setaffinity(pid, kmask);

    free_cpumask_var(kmask);
    return err;
}

/*
 * Set the CPU affinity of the task identified by @pid to @in_mask, restricted
 * to the CPUs returned by cpuset_cpus_allowed() for that task.
 *
 * Returns 0 on success, or a negative error code:
 *   -ESRCH   no task found for @pid
 *   -EINVAL  the task has PF_NO_SETAFFINITY set
 *   -ENOMEM  a temporary cpumask could not be allocated
 *   -EPERM   check_same_owner() failed and the caller lacks CAP_SYS_NICE in
 *            the task's user namespace
 *   -EBUSY   (CONFIG_SMP) deadline task with bandwidth control enabled whose
 *            root-domain span is not covered by the new mask
 * or whatever security_task_setscheduler() / set_cpus_allowed_ptr() return.
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
    cpumask_var_t cpus_allowed, new_mask;
    struct task_struct *p;
    int retval;
    rcu_read_lock();

    p = find_process_by_pid(pid); /* look up the task_struct for this PID */
    if (!p)
    {
        rcu_read_unlock();
        return -ESRCH;
    }

    /* Prevent p going away while its affinity is being changed */
    get_task_struct(p);
    rcu_read_unlock();
    /* Tasks flagged PF_NO_SETAFFINITY may not have their affinity changed here */
    if (p->flags & PF_NO_SETAFFINITY)
    {
        retval = -EINVAL;
        goto out_put_task;
    }
    /* Scratch mask that will hold the cpuset-allowed CPUs */
    if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
    {
        retval = -ENOMEM;
        goto out_put_task;
    }
    /* Scratch mask that will hold the effective mask to apply */
    if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
    {
        retval = -ENOMEM;
        goto out_free_cpus_allowed;
    }
    retval = -EPERM; /* returned if the ownership/capability check below fails */
    /* presumably compares the caller's uid with the target task's uid — confirm in check_same_owner() */
    if (!check_same_owner(p))
    {
        rcu_read_lock();
        /* Different owner: require CAP_SYS_NICE in the target's user namespace */
        if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
        {
            rcu_read_unlock();
            goto out_unlock;
        }
        rcu_read_unlock();
    }
    /* Security hook: any non-zero result is returned to the caller as-is */
    retval = security_task_setscheduler(p);
    if (retval)
    {
        goto out_unlock;
    }
    /* Fetch the CPUs the task's cpuset allows into cpus_allowed */
    cpuset_cpus_allowed(p, cpus_allowed);
    /* new_mask = in_mask & cpus_allowed: the request is clipped to the cpuset */
    cpumask_and(new_mask, in_mask, cpus_allowed);

    /*
     * Since bandwidth control happens on root_domain basis,
     * if admission test is enabled, we only admit -deadline
     * tasks allowed to run on all the CPUs in the task's
     * root_domain.
     */
#ifdef CONFIG_SMP
    if (task_has_dl_policy(p) && dl_bandwidth_enabled())
    {
        rcu_read_lock();
        if (!cpumask_subset(task_rq(p)->rd->span, new_mask))
        {
            retval = -EBUSY;
            rcu_read_unlock();
            goto out_unlock;
        }
        rcu_read_unlock();
    }
#endif
again:
    retval = set_cpus_allowed_ptr(p, new_mask);

    if (!retval) /* the mask was applied */
    {
        /* Re-read the cpuset's allowed CPUs to detect a concurrent change */
        cpuset_cpus_allowed(p, cpus_allowed);
        if (!cpumask_subset(new_mask, cpus_allowed))
        {
            /*
             * We must have raced with a concurrent cpuset
             * update. Just reset the cpus_allowed to the
             * cpuset's cpus_allowed
             */
            cpumask_copy(new_mask, cpus_allowed);
            goto again;
        }
    }
out_unlock:
    free_cpumask_var(new_mask);
out_free_cpus_allowed:
    free_cpumask_var(cpus_allowed);
out_put_task:
    put_task_struct(p); /* drop the reference taken by get_task_struct() */
    return retval;
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 *
 * Returns 0 on success (including when @new_mask already equals the task's
 * cpus_allowed), or -EINVAL when @new_mask shares no CPU with
 * cpu_active_mask.
 */
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
    unsigned long flags;
    struct rq *rq;
    unsigned int dest_cpu;
    int ret = 0;
    rq = task_rq_lock(p, &flags);
    /* Nothing to do if the requested mask is the one already in place */
    if (cpumask_equal(&p->cpus_allowed, new_mask))
    {
        goto out;
    }
    /* The requested mask must share at least one CPU with cpu_active_mask */
    if (!cpumask_intersects(new_mask, cpu_active_mask))
    {
        ret = -EINVAL;
        goto out;
    }

    /*
     * Apply the new mask to the task.
     * NOTE(review): presumably updates p->cpus_allowed and nr_cpus_allowed,
     * and is a no-op without CONFIG_SMP — confirm in do_set_cpus_allowed().
     */
    do_set_cpus_allowed(p, new_mask);

    /* Can the task run on the task's current CPU? If so, we're done */
    if (cpumask_test_cpu(task_cpu(p), new_mask))
    {
        goto out;
    }

    /* Pick a destination CPU from cpu_active_mask & new_mask */
    dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
    if (p->on_rq)
    {
        struct migration_arg arg = {p, dest_cpu};
        /* Need help from migration thread: drop lock and wait. */
        task_rq_unlock(rq, p, &flags);
        stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
        tlb_migrate_finish(p->mm);
        return 0;
    }
out:
    task_rq_unlock(rq, p, &flags);
    return ret;
}

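在sched_setaffinity()調用中，傳回的錯誤是：operation not permitted（EPERM）。從上面的代碼看，EPERM可能來自兩處：一是check_same_owner(p)失敗且調用者不具備CAP_SYS_NICE能力（此時傳回預先設定的retval = -EPERM）；二是security_task_setscheduler(p)傳回錯誤。如果程式是以root身份執行的（具備CAP_SYS_NICE），第一處檢查可以通過，那麼錯誤應該是出現在security_task_setscheduler(p);函數的調用中。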

另外，從上面函數的分析過程中，我們可以看到，使用者設定的要親和的CPU需要和系統中活躍的CPU有交集，而且最終設定的是和cpuset中的cpus_allowed相與的結果。設定成功時，最終設定的結果也應該是cpuset中cpus_allowed的子集。

下一步就是要分析security_task_setscheduler(p)函數在有CPU綁定和沒有CPU綁定時的一個差別。

未完待續。

linux系統調用sched_setaffinity核心實作分析

繼續閱讀

linux記憶體源碼分析 - 記憶體壓縮(同步關系)概述　頁面遷移

linux記憶體源碼分析 - 記憶體壓縮(同步關系)概述　頁面遷移

Linux Mutex機制分析（#ifdef CONFIG_MUTEX_SPIN_ON_OWNER … #endif、#ifdef CONFIG_DEBUG_MUTEXES … #endif、#ifdef CONFIG_DEBUG_LOCK_ALLOC … #endif）

Linus Torvalds談Linux核心開發管理風格

本來想分析linux下的網絡應用什麼情況下會發送RST封包。也查到一些資料，但還是想通過linux核心源碼佐證一下。不過

linux核心源碼解析- 匿名頁面生命周期

linux核心源碼解析–page資料結構

Linux核心中的記憶體屏障

Linux核心中的鎖

今天在查詢資料的時候，發現一個非常好的Linux核心學習的項目，特分享給大家，大家一起進步位址：網頁連結内容包括：1.2

Linux記憶體管理(4) - 不連續頁的配置設定vmalloc

談談NAPI機制1. 引入問題2. 幾個關鍵函數3. 舊的收包接口netif_rx4. NAPI機制5. 在中斷期間處理多幀

linux核心網絡代碼學習之碎片整理篇

負載均衡-指定程序/線程運作的CPU-(affinity setup)

Linux核心中的連結清單

c++ 中斷_Linux學習第23節，什麼是中斷？從C源碼分析核心如何設計和實作它中斷的基本概念Linux 核心中的中斷處理程式注冊中斷處理程式Linux 核心是如何執行中斷處理程式的Linux 核心是如何釋放中斷處理程式的