一、CFS排程器結構:
思想:
理想狀态下每個程序都能獲得相同的時間片,并且同時運作在CPU上,但實際上一個CPU同一
時刻運作的程序隻能有一個。也就是說,當一個程序占用CPU時,其他程序就必須等待。CFS為了實
現公平,必須懲罰目前正在運作的程序,以使那些正在等待的程序下次被排程。
1、CFS運作隊列:
每個CPU都有自己的運作隊列,對應不同的排程器也有自己的運作隊列,管理CFS排程的隊列為cfs_rq:
/*
 * CFS run queue: per-scheduler-instance bookkeeping for the fair class.
 * (Excerpt; the kernel's struct cfs_rq has many more members.)
 * Fix: the original excerpt was missing the terminating ';'.
 */
struct cfs_rq {
	struct load_weight load;	/* aggregate load weight of queued entities */
	unsigned int nr_running, h_nr_running;	/* runnable entities on this queue (h_: hierarchical total) */
	u64 min_vruntime;	/* smallest virtual runtime on this queue; monotonically increases */
};
成員 | 描述 |
load | 用來記錄權重資訊 |
nr_running | 就緒隊列上排程實體的個數 |
min_vruntime | 就緒隊列上所有排程實體的最小虛拟時間 |
2、CFS排程實體:
排程的實體用于描述排程的具體資訊,包含排程的權重、虛拟運作時間等:
/*
 * CFS scheduling entity: the unit the fair scheduler actually schedules
 * (a task, or a task group when CONFIG_FAIR_GROUP_SCHED is enabled).
 * (Excerpt from the kernel definition.)
 * Fix: the original excerpt was missing the terminating ';'.
 */
struct sched_entity {
	struct load_weight load;	/* entity weight, for load-balancing */
	struct rb_node run_node;	/* link into the per-runqueue red-black tree */
	struct list_head group_node;	/* link into the cgroup's task list */
	unsigned int on_rq;	/* 1 while queued on a runqueue, 0 otherwise */
	u64 exec_start;	/* timestamp when the entity last started running */
	u64 sum_exec_runtime;	/* total real CPU time consumed */
	u64 vruntime;	/* total weighted (virtual) runtime; rb-tree sort key */
	u64 prev_sum_exec_runtime;	/* sum_exec_runtime snapshot at last pick */
	u64 nr_migrations;	/* number of cross-CPU migrations */
	struct sched_entity *parent;	/* enclosing group entity, if any */
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq *cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq *my_q;
};
成員 | 描述 |
load | 權重資訊 |
run_node | CFS排程器的每個就緒隊列維護了一顆紅黑樹,run_node是挂載點 |
on_rq | 排程實體se加入就緒隊列後,on_rq置1。從就緒隊列删除後,on_rq置0 |
vruntime | 排程實體已經運作的虛拟時間總和 |
3、CFS排程類:
排程類是程序排程的具體實作,包含入隊、出隊、搶占判斷、下一個運作
程序的選擇等:
/*
 * CFS scheduling class: the table of callbacks the core scheduler calls
 * to drive fair-class tasks. The core never touches CFS internals
 * directly; it always dispatches through these function pointers.
 */
const struct sched_class fair_sched_class = {
.next = &idle_sched_class, /* next lower-priority class to try */
.enqueue_task = enqueue_task_fair, /* make a task runnable on this rq */
.dequeue_task = dequeue_task_fair, /* remove a task from the rq */
.yield_task = yield_task_fair, /* sched_yield() support */
.yield_to_task = yield_to_task_fair, /* directed yield (yield_to()) */
.check_preempt_curr = check_preempt_wakeup, /* should a woken task preempt current? */
.pick_next_task = pick_next_task_fair, /* choose the next task to run */
.put_prev_task = put_prev_task_fair, /* bookkeeping when current is switched out */
.set_curr_task = set_curr_task_fair, /* a task became current on this rq */
.task_tick = task_tick_fair, /* periodic tick: update runtime, check preemption */
.task_fork = task_fork_fair, /* initialize a newly forked task */
.prio_changed = prio_changed_fair, /* react to a priority change */
.switched_from = switched_from_fair, /* task left the fair class */
.switched_to = switched_to_fair, /* task joined the fair class */
.get_rr_interval = get_rr_interval_fair, /* sched_rr_get_interval() support */
.update_curr = update_curr_fair, /* refresh current task's runtime stats */
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair, /* task moved between cgroups */
#endif
};
(1)CFS程序的建立:
do_fork()---->_do_fork()---->copy_process()---->sched_fork()
p->sched_class->task_fork 會調用到fair_sched_class的task_fork_fair成員:
/*
 * Excerpt: fork-time scheduler setup for the child task. Chooses the
 * scheduling class from the task's priority, then lets that class run
 * its own fork hook (for CFS: task_fork_fair).
 */
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
p->state = TASK_RUNNING;
/* Deadline tasks may not fork into the same bandwidth reservation. */
if (dl_prio(p->prio)) {
put_cpu();
return -EAGAIN;
} else if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class; /* real-time priority: RT class */
} else {
p->sched_class = &fair_sched_class; /* default: CFS */
}
/* Class-specific fork setup; for CFS this is task_fork_fair(). */
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
......
}
/*
 * Excerpt: CFS task_fork callback. Gives the newly forked child a fair
 * starting vruntime so fork() cannot be used to gain CPU time.
 * (Locals cfs_rq/curr/se are declared in the elided full version;
 * se is the child's scheduling entity.)
 */
static void task_fork_fair(struct task_struct *p)
{
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
__set_task_cpu(p, this_cpu); /* bind the child to the current CPU */
update_curr(cfs_rq); /* refresh the running entity's runtime statistics */
if (curr)
se->vruntime = curr->vruntime; /* child inherits the parent's vruntime */
place_entity(cfs_rq, se, 1); /* apply the new-task placement/penalty */
/* Store vruntime queue-relative; min_vruntime is re-added at enqueue. */
se->vruntime -= cfs_rq->min_vruntime;
}
task_fork_fair函數所完成的工作可以用下圖清晰的表明,主要包含計算并更新虛拟時間vruntime:
(2)喚醒程序
wake_up_new_task -> activate_task ->enqueue_task ->p->sched_class->enqueue_task(rq, p, flags);
這裡會調動到enqueue_task_fair成員:
/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
/* Walk up the entity hierarchy (task -> enclosing groups). */
for_each_sched_entity(se) {
if (se->on_rq) /* already queued at this level: stop */
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags); /* insert into this level's runqueue */
}
......
}
/*
 * Excerpt: insert one scheduling entity into a CFS runqueue.
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/* For a newly forked/migrated entity (not a plain wakeup), re-add the
 * queue's min_vruntime that was subtracted when it left a runqueue. */
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
/* Update the current entity's vruntime and the queue's min_vruntime. */
update_curr(cfs_rq);
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se); /* insert into the CFS red-black tree */
se->on_rq = 1; /* mark the entity as queued */
......
}
(4)排程程序:
__schedule()是排程的核心函數,其作用是讓排程器選擇和切換到一個合适的程序運作,核心代碼中
對于schedule的排程場景已經描述得非常詳細:
/*
* __schedule() is the main scheduler function.
*
* The main means of driving the scheduler and thus entering this function are:
*
* 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
*
* 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
* paths. For example, see arch/x86/entry_64.S.
* To drive preemption between tasks, the scheduler sets the flag in timer
* interrupt handler scheduler_tick().
*
* 3. Wakeups don't really cause entry into schedule(). They add a
* task to the run-queue and that's it.
*
* Now, if the new task added to the run-queue preempts the current
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
* called on the nearest possible occasion:
*
* - If the kernel is preemptible (CONFIG_PREEMPT=y):
*
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
* spin_unlock()!)
* - in IRQ context, return from interrupt-handler to
* preemptible context
*
* - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
* then at the next:
*
* - cond_resched() call
* - explicit schedule() call
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
*
* WARNING: must be called with preemption disabled!
*/
/* Excerpt: the three key steps of the core scheduling function. */
static void __sched notrace __schedule(bool preempt)
{
next = pick_next_task(rq, prev); /* choose the next task to run */
clear_tsk_need_resched(prev); /* clear the TIF_NEED_RESCHED flag */
rq = context_switch(rq, prev, next); /* switch to the chosen task */
}
pick_next_task會調用到CFS的成員:fair_sched_class.pick_next_task(rq, prev)選擇一個合适的程序執行。
用下圖總結性的看下程序的切換流程:
(5)程序的睡眠:
CFS 排程管理的程序在睡眠時會調用到 dequeue_task_fair:
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);//将排程實體se從對應的就緒隊列cfs_rq上删除
}
}
/*
 * Excerpt: remove one scheduling entity from a CFS runqueue.
 */
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
 * Update run-time statistics of the 'current'.
 */
update_curr(cfs_rq); /* refresh vruntime before removing the entity */
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se); /* remove from the red-black tree */
se->on_rq = 0; /* entity is no longer on a runqueue */
account_entity_dequeue(cfs_rq, se);
/* Unless the task is merely sleeping, make vruntime queue-relative so
 * it can migrate and be re-normalized against the target queue's
 * min_vruntime at enqueue time. */
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
}
作者:frank_zyp
您的支援是對部落客最大的鼓勵,感謝您的認真閱讀。
本文無所謂版權,歡迎轉載。