天天看點

程序排程四 linux CFS排程器

一、CFS排程器結構:

思想:

  理想狀态下每個程序都能獲得相同的時間片,并且同時運作在CPU上,但實際上一個CPU同一

時刻運作的程序隻能有一個。也就是說,當一個程序占用CPU時,其他程序就必須等待。CFS為了實

現公平,必須懲罰目前正在運作的程序,以使那些正在等待的程序下次被排程。

1、CFS運作隊列:

  每個CPU都有自己的運作隊列,對應不同的排程器也有自己的運作隊列,管理CFS排程的的隊列為cfs_rq:

struct cfs_rq {
	struct load_weight load;
	unsigned int nr_running, h_nr_running;

	u64 min_vruntime;
}
           
成員 描叙
load

struct load_weight

用來記錄權重資訊
nr_runing 就緒隊列上排程實體的個數
min_vruntime 就緒隊列上所有排程實體的最小虛拟時間

2、CFS排程實體:

  排程的實體用于描叙排程的具體資訊,包含排程的權重、虛拟運作時間等:

struct sched_entity {
	struct load_weight	load;		/* for load-balancing */
	struct rb_node		run_node;
	struct list_head	group_node;
	unsigned int		on_rq;

	u64			exec_start;
	u64			sum_exec_runtime;
	u64			vruntime;
	u64			prev_sum_exec_runtime;

	u64			nr_migrations;
	struct sched_entity	*parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq		*cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq		*my_q;
}
           
成員 描叙
load 權重資訊
run_node CFS排程器的每個就緒隊列維護了一顆紅黑樹,run_node是挂載點
on_rq 排程實體se加入就緒隊列後,on_rq置1。從就緒隊列删除後,on_rq置0
vruntime 排程實體已經運作的虛拟時間總和

3、CFS排程類:

  排程類是程序排程的具體實作,是程序排程的具體實作,包含入隊、出隊、搶占判斷、下一個運作

程序的選擇等:

const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

	.update_curr		= update_curr_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_move_group	= task_move_group_fair,
#endif
};
           

(1)CFS程序的建立:

  do_fork()---->_do_fork()---->copy_process()---->sched_fork()

  p->sched_class->task_fork 會調用到fair_sched_class的task_fork_fair成員:

int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
	p->state = TASK_RUNNING;

	if (dl_prio(p->prio)) {
		put_cpu();
		return -EAGAIN;
	} else if (rt_prio(p->prio)) {
		p->sched_class = &rt_sched_class;
	} else {
		p->sched_class = &fair_sched_class;
	}

	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);

......
}
           
static void task_fork_fair(struct task_struct *p)
{
	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;

	__set_task_cpu(p, this_cpu);//把目前CPU綁定到該程序中

	update_curr(cfs_rq);//更新目前正在運作的排程實體的運作時間資訊

	if (curr)
		se->vruntime = curr->vruntime;//初始化目前建立的新程序的虛拟時間
	place_entity(cfs_rq, se, 1);

	se->vruntime -= cfs_rq->min_vruntime;
}
           

task_fork_fair函數所完成的工作可以用下圖清晰的表明,主要包含計算并更新虛拟時間vruntime:

程式排程四 linux CFS排程器

(2)喚醒程序

  wake_up_new_task -> activate_task ->enqueue_task ->p->sched_class->enqueue_task(rq, p, flags);

  這裡會調動到enqueue_task_fair成員:

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	for_each_sched_entity(se) {
		if (se->on_rq)//如果在就緒隊列中,不需要再次添加則break
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, flags);//入隊操作
   }
......
}
           
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    //如果是建立的程序再添加上cfs_rq->min_vruntime
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

    //更新目前程序的vruntime和CFS就緒隊列的min_vruntime
	update_curr(cfs_rq);

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);//把該排程實體加入到CFS就緒隊列的紅黑樹中
	se->on_rq = 1;
......
}
           

(4)排程程序:

  __schedule()是排程的核心函數,其作用是讓排程器選擇和切換到一個合适的程序運作,核心代碼中

對于schedule的排程場景以及描叙的非常詳細:

/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. For example, see arch/x86/entry_64.S.
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable(). (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *         - in IRQ context, return from interrupt-handler to
 *           preemptible context
 *
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *         then at the next:
 *
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 *
 * WARNING: must be called with preemption disabled!
 */
           
static void __sched notrace __schedule(bool preempt)
{
	next = pick_next_task(rq, prev);//選擇下個合适的程序開始運作

	clear_tsk_need_resched(prev);//清除TIF_NEED_RESCHED flag

	rq = context_switch(rq, prev, next); //切換程序
}
           

pick_next_task會調用到CFS的成員:fair_sched_class.pick_next_task(rq, prev)選擇一個合适的程序執行。

用下圖總結性的看下程序的切換流程:

程式排程四 linux CFS排程器

(5)程序的睡眠:

CFS 排程管理的程序再說睡眠時會調用到 enqueue_task_fair:

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, flags);//将排程實體se從對應的就緒隊列cfs_rq上删除
    }
}
           
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);//更新虛拟時間vruntime

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);//從紅黑樹節點删除排程實體
	se->on_rq = 0;//更新on_rq成員

	account_entity_dequeue(cfs_rq, se);

	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;//減去目前就緒隊列對應的最小虛拟時間
}
           

作者:frank_zyp

您的支援是對部落客最大的鼓勵,感謝您的認真閱讀。

本文無所謂版權,歡迎轉載。