Linux程序排程(2)：CFS排程操作

Linux主要實作了兩大類排程算法，CFS(完全公平排程算法）和實時排程算法。宏SCHED_NOMAL和SCHED_BATCH主要用于CFS排程，而SCHED_FIFO和SCHED_RR主要用于實時排程。這幾個宏的定義可以在include/linux/sched.h中找到。檔案kernel/sched.c包含了核心排程器及相關系統調用的實作。排程的核心函數為sched.c中的schedule()，schedule函數封裝了核心排程的架構。細節實作上調用具體的排程算法類中的函數實作，如kernel/sched_fair.c或kernel/sched_rt.c中的實作。

1、時鐘tick中斷的處理

在CFS中，當産生時鐘tick中斷時，sched.c中scheduler_tick()函數會被時鐘中斷（定時器timer的代碼）直接調用，我們調用它則是在禁用中斷時。注意在fork的代碼中，當修改父程序的時間片時，也會導緻sched_tick的調用。sched_tick函數首先更新排程資訊，然後調整目前程序在紅黑樹中的位置。調整完成後如果發現目前程序不再是最左邊的葉子，就标記need_resched标志，中斷傳回時就會調用scheduler()完成程序切換，否則目前程序繼續占用CPU。注意這與以前的排程器不同，以前是tick中斷導緻時間片遞減，當時間片被用完時才觸發優先級調整并重新排程。sched_tick函數的代碼如下：

void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	spin_lock(&rq->lock);
	update_rq_clock(rq);
	update_cpu_load(rq);
	curr->sched_class->task_tick(rq, curr, 0);
	spin_unlock(&rq->lock);

	perf_event_task_tick(curr, cpu);

#ifdef CONFIG_SMP
	rq->idle_at_tick = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);
#endif
}

它先擷取目前CPU上的運作隊列中的目前運作程序，更新runqueue級變量clock，然後通過sched_class中的接口名task_tick，調用CFS的tick處理函數task_tick_fair()，以處理時鐘中斷。我們看kernel/sched_fair.c中的CFS算法實作。具體的排程類如下：

static const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,

	.load_balance		= load_balance_fair,
	.move_one_task		= move_one_task_fair,
	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_waking		= task_waking_fair,
#endif

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_move_group	= task_move_group_fair,
#endif
};

task_tick_fair函數用于輪詢排程類的中一個程序。實作如下：

static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {  /* 考慮了組排程 */
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}
}

該函數擷取各層的排程實體，對每個排程實體擷取CFS運作隊列，調用entity_tick程序進行處理。kernel/sched_fair.c中的函數entity_tick源代碼如下：

static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_task(rq_of(cfs_rq)->curr);
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
		check_preempt_tick(cfs_rq, curr);
}

該函數用kernel/sched_fair.c:update_curr()更新目前程序的運作時統計資訊，然後調用kernel/sched_fair.c:check_preempt_tick()，檢測是否需要重新排程，用下一個程序來搶占目前程序。update_curr()實作記賬功能,由系統定時器周期調用，實作如下：

static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));

	curr->sum_exec_runtime += delta_exec; /* 總運作時間更新 */
	schedstat_add(cfs_rq, exec_clock, delta_exec); /* 更新cfs_rq的exec_clock */
    /* 用優先級和delta_exec來計算weighted，以用于更新vruntime */
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

	curr->vruntime += delta_exec_weighted; /* 更新目前程序的vruntime */
	update_min_vruntime(cfs_rq);
}

static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock_task;  /* now計時器 */
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * 擷取從最後一次修改負載後目前程序所占用的運作總時間，
	 * 即計算目前程序的執行時間
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)  /* 如果本次沒有執行過，不用重新更新了 */
		return;
	/* 根據目前可運作程序總數對運作時間進行權重計算 */
	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;  /* 将exec_start屬性置為now */

	if (entity_is_task(curr)) {  /* 下面為關于組排程的 */
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}
}

這裡delta_exec獲得從最後一次修改負載後目前程序所占用的運作總時間，即計算目前程序的執行時間。然後調用__update_curr()更新程序的vruntime。更新前需要計算weighted，這由sched_fair.c:calc_delta_fair()實作，如下：

static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}

在calc_delta_fair中，如果程序的優先級為0,那麼就是傳回delta，如果不為0,就要調用kernel/sched.c中的calc_delta_mine對delta值進行修正，如下：

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

/*
 * delta *= weight / lw
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	if (!lw->inv_weight) {
		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
			lw->inv_weight = 1;
		else
			lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
				/ (lw->weight+1);
	}

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}

CFS允許每個程序運作一段時間、循環輪轉、選擇運作最少的程序作為下一個運作程序，而不再采用配置設定給每個程序時間片的做法了，CFS在所有可運作程序總數基礎上計算出一個程序應該運作多久，而不是依靠nice值來計算時間片。nice值在CFS中被作為程序獲得的處理器運作比的權重，越高的nice值（越低的優先級）程序獲得更低的處理器使用權重，這是相對預設nice值程序的程序而言的；相反，更低的nice值（越高的優先級）的程序獲得更高的處理器使用權重。

這裡delta的計算有如下關系: delta=delta* NICE_0_LOAD/se->load。se->load值是怎麼來的呢？可以跟蹤sys_nice()，就可以發現se->load其實就是表示nice對應的load值，nice越低，值越大。據此就可以得到一個結論，在執行相同時間的條件下(delta相同)，高優先的程序計算出來的delta值會比低優先級的程序計算出來的低。應此高優先的程序就會位于rb_tree的左邊，在下次排程的時候就會優先排程。

回到entity_tick，我們看check_preempt_tick()的實作，它用來檢測是否需要重新排程下一個程序。如下：

static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;

	ideal_runtime = sched_slice(cfs_rq, curr);
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime) {
		resched_task(rq_of(cfs_rq)->curr);
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (!sched_feat(WAKEUP_PREEMPT))
		return;

	if (delta_exec < sysctl_sched_min_granularity)
		return;

	if (cfs_rq->nr_running > 1) { /* 用于組排程 */
		struct sched_entity *se = __pick_next_entity(cfs_rq);
		s64 delta = curr->vruntime - se->vruntime;

		if (delta > ideal_runtime)
			resched_task(rq_of(cfs_rq)->curr);
	}
}

該函數先擷取目前程序的理想運作時間，如果目前執行時間超過理想時間，調用kernel/sched.c:resched_task()設定need_resched标志，完成設定的函數為resched_task()--->set_tsk_need_resched(p)，表示需要重新排程程序。

從上面分析可以看出，通過調用鍊sched_tick()--->task_tick_fair()--->entity_tick()--->［update_curr()--->__update_curr()--->calc_delta_fair()--->calc_delta_mine()] 和 [check_preempt_tick()--->resched_task()]，最終會更新排程資訊，設定need_resched排程标志。當中斷傳回時，就會調用schedule()進行搶占式排程。

2、CFS排程操作

在sched_fair.c中，CFS實作了用紅黑樹對運作隊列進行管理的相關操作。

（1）程序插入enqueue_task_fair：更新排程資訊，調用enqueue_entity()--->__enqueue_entity()将排程實體插入到紅黑樹中。它會在nr_running遞增之前被調用。插入時，會找到右邊的空間并進行插入，然後緩存最左邊的節點。對于組排程，會對組中的所有程序進行操作。如下：

static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	s64 key = entity_key(cfs_rq, se); /* key為被插入程序的vruntime */
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (key < entity_key(cfs_rq, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

可見CFS的運作隊列布局是放在紅黑樹裡面的，而這顆紅黑樹的排序方式是按照運作實體的vruntime來的。vruntime的計算方式在上面已經做了分析。在前面“Linux程序管理”的幾節介紹中，我們可以看到fork()在建立子程序時最後就會調用enqueue_task_fair()，将新建立的程序插入到紅黑樹中。

（2）程序選擇pick_next_task_fair：CFS排程算法的核心是選擇具有最小vruntine的任務。運作隊列采用紅黑樹方式存放，其中節點的鍵值便是可運作程序的虛拟運作時間。CFS排程器選取待運作的下一個程序，是所有程序中vruntime最小的那個，他對應的便是在樹中最左側的葉子節點。實作選擇的函數為 pick_next_task_fair。如下：

static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct task_struct *p;
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (unlikely(!cfs_rq->nr_running))
		return NULL;

	do {   /* 此循環為了考慮組排程 */
		se = pick_next_entity(cfs_rq);
		set_next_entity(cfs_rq, se);  /* 設定為目前運作程序 */
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	hrtick_start_fair(rq, p);

	return p;
}

該函數調用pick_next_entity()--->__pick_next_entity()完成擷取下一個程序的工作，這個函數如下：

static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

該函數并不會周遊紅黑樹來找到最左葉子節點（是所有程序中vruntime最小的那個），因為該值已經緩存在rb_leftmost字段中。它通過rb_entry函數傳回這個緩存的節點程序。完成實質工作的調用為include/linux/rbtree.h:rb_entry()--->include/linux/kernel.h:container_of()，這是一個宏定義。

（3）程序删除dequeue_task_fair：從紅黑樹中删除程序，并更新排程資訊。它會在nr_running遞減之前被調用。完成實質工作的函數為dequeue_entity()--->__dequeue_entity()。如下：

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

該函數會删除目前程序，并從紅黑樹中選出下一個具體最小vruntime值的節點，作為新的最左邊節點緩存起來。

Linux程序排程(2)：CFS排程操作

繼續閱讀

Ubuntu14.04 LTS下安裝mongodb

筆試面試題目：滑動視窗(二)

httpd服務的部署、啟動、配置和簡單優化一、部署二、啟動三、配置檔案

配置網頁内容通路

手動安裝Intel network I217-LM網卡的Linux驅動

禁止ubuntu系統彈出報錯界面

資料結構與算法（27）——排序（二）

Ubuntu Linux下Apache的配置檔案

Dijkstra--簡易版（最短路徑）

GitHub連夜封殺！這份阿裡 10W 字内部 Java 字面試手冊到底有多強？

samba伺服器的功能

【Linux】UDP廣播封包接收速率問題

Linux裝置模型（中）之上層容器

PowerPC平台 Linux移植三

hdu7108哈希