一、CFS排程器結構:
思想:
理想狀态下每個程序都能獲得相同的時間片,并且同時運作在CPU上,但實際上一個CPU同一
時刻運作的程序隻能有一個。也就是說,當一個程序占用CPU時,其他程序就必須等待。CFS為了實
現公平,必須懲罰目前正在運作的程序,以使那些正在等待的程序下次被排程。
1、CFS運作隊列:
每個CPU都有自己的運作隊列,對應不同的排程器也有自己的運作隊列,管理CFS排程的隊列為cfs_rq:
/*
 * CFS run queue: per-scheduler-instance bookkeeping for the fair class.
 * (Excerpt; the kernel's struct cfs_rq has many more members.)
 * Fix: the original excerpt was missing the terminating ';'.
 */
struct cfs_rq {
	struct load_weight load;	/* aggregate load weight of queued entities */
	unsigned int nr_running, h_nr_running;	/* runnable entities on this queue (h_: hierarchical total) */
	u64 min_vruntime;	/* smallest virtual runtime on this queue; monotonically increases */
};
成員 | 描述 |
load | 用來記錄權重資訊 |
nr_running | 就緒隊列上排程實體的個數 |
min_vruntime | 就緒隊列上所有排程實體的最小虛拟時間 |
2、CFS排程實體:
排程的實體用于描述排程的具體資訊,包含排程的權重、虛拟運作時間等:
/*
 * CFS scheduling entity: the unit the fair scheduler actually schedules
 * (a task, or a task group when CONFIG_FAIR_GROUP_SCHED is enabled).
 * (Excerpt from the kernel definition.)
 * Fix: the original excerpt was missing the terminating ';'.
 */
struct sched_entity {
	struct load_weight load;	/* entity weight, for load-balancing */
	struct rb_node run_node;	/* link into the per-runqueue red-black tree */
	struct list_head group_node;	/* link into the cgroup's task list */
	unsigned int on_rq;	/* 1 while queued on a runqueue, 0 otherwise */
	u64 exec_start;	/* timestamp when the entity last started running */
	u64 sum_exec_runtime;	/* total real CPU time consumed */
	u64 vruntime;	/* total weighted (virtual) runtime; rb-tree sort key */
	u64 prev_sum_exec_runtime;	/* sum_exec_runtime snapshot at last pick */
	u64 nr_migrations;	/* number of cross-CPU migrations */
	struct sched_entity *parent;	/* enclosing group entity, if any */
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq *cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq *my_q;
};
成員 | 描述 |
load | 權重資訊 |
run_node | CFS排程器的每個就緒隊列維護了一顆紅黑樹,run_node是挂載點 |
on_rq | 排程實體se加入就緒隊列後,on_rq置1。從就緒隊列删除後,on_rq置0 |
vruntime | 排程實體已經運作的虛拟時間總和 |
3、CFS排程類:
排程類是程序排程的具體實作,包含入隊、出隊、搶占判斷、下一個運作
程序的選擇等:
/*
 * CFS scheduling class: the table of callbacks the core scheduler calls
 * to drive fair-class tasks. The core never touches CFS internals
 * directly; it always dispatches through these function pointers.
 */
const struct sched_class fair_sched_class = {
.next = &idle_sched_class, /* next lower-priority class to try */
.enqueue_task = enqueue_task_fair, /* make a task runnable on this rq */
.dequeue_task = dequeue_task_fair, /* remove a task from the rq */
.yield_task = yield_task_fair, /* sched_yield() support */
.yield_to_task = yield_to_task_fair, /* directed yield (yield_to()) */
.check_preempt_curr = check_preempt_wakeup, /* should a woken task preempt current? */
.pick_next_task = pick_next_task_fair, /* choose the next task to run */
.put_prev_task = put_prev_task_fair, /* bookkeeping when current is switched out */
.set_curr_task = set_curr_task_fair, /* a task became current on this rq */
.task_tick = task_tick_fair, /* periodic tick: update runtime, check preemption */
.task_fork = task_fork_fair, /* initialize a newly forked task */
.prio_changed = prio_changed_fair, /* react to a priority change */
.switched_from = switched_from_fair, /* task left the fair class */
.switched_to = switched_to_fair, /* task joined the fair class */
.get_rr_interval = get_rr_interval_fair, /* sched_rr_get_interval() support */
.update_curr = update_curr_fair, /* refresh current task's runtime stats */
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair, /* task moved between cgroups */
#endif
};
(1)CFS程序的建立:
do_fork()---->_do_fork()---->copy_process()---->sched_fork()
p->sched_class->task_fork 會調用到fair_sched_class的task_fork_fair成員:
/*
 * Excerpt: fork-time scheduler setup for the child task. Chooses the
 * scheduling class from the task's priority, then lets that class run
 * its own fork hook (for CFS: task_fork_fair).
 */
int sched_fork(unsigned long clone_flags, struct task_struct *p)
{
p->state = TASK_RUNNING;
/* Deadline tasks may not fork into the same bandwidth reservation. */
if (dl_prio(p->prio)) {
put_cpu();
return -EAGAIN;
} else if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class; /* real-time priority: RT class */
} else {
p->sched_class = &fair_sched_class; /* default: CFS */
}
/* Class-specific fork setup; for CFS this is task_fork_fair(). */
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
......
}
/*
 * Excerpt: CFS task_fork callback. Gives the newly forked child a fair
 * starting vruntime so fork() cannot be used to gain CPU time.
 * (Locals cfs_rq/curr/se are declared in the elided full version;
 * se is the child's scheduling entity.)
 */
static void task_fork_fair(struct task_struct *p)
{
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
__set_task_cpu(p, this_cpu); /* bind the child to the current CPU */
update_curr(cfs_rq); /* refresh the running entity's runtime statistics */
if (curr)
se->vruntime = curr->vruntime; /* child inherits the parent's vruntime */
place_entity(cfs_rq, se, 1); /* apply the new-task placement/penalty */
/* Store vruntime queue-relative; min_vruntime is re-added at enqueue. */
se->vruntime -= cfs_rq->min_vruntime;
}
task_fork_fair函數所完成的工作可以用下圖清晰的表明,主要包含計算并更新虛拟時間vruntime:
(2)喚醒程序
wake_up_new_task -> activate_task ->enqueue_task ->p->sched_class->enqueue_task(rq, p, flags);
這裡會調動到enqueue_task_fair成員:
/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
/* Walk up the entity hierarchy (task -> enclosing groups). */
for_each_sched_entity(se) {
if (se->on_rq) /* already queued at this level: stop */
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags); /* insert into this level's runqueue */
}
......
}
/*
 * Excerpt: insert one scheduling entity into a CFS runqueue.
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/* For a newly forked/migrated entity (not a plain wakeup), re-add the
 * queue's min_vruntime that was subtracted when it left a runqueue. */
if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
/* Update the current entity's vruntime and the queue's min_vruntime. */
update_curr(cfs_rq);
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se); /* insert into the CFS red-black tree */
se->on_rq = 1; /* mark the entity as queued */
......
}
(4)排程程序:
__schedule()是排程的核心函數,其作用是讓排程器選擇和切換到一個合适的程序運作,核心代碼中
對于schedule的排程場景已經描述得非常詳細:
/*
* __schedule() is the main scheduler function.
*
* The main means of driving the scheduler and thus entering this function are:
*
* 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
*
* 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
* paths. For example, see arch/x86/entry_64.S.
* To drive preemption between tasks, the scheduler sets the flag in timer
* interrupt handler scheduler_tick().
*
* 3. Wakeups don't really cause entry into schedule(). They add a
* task to the run-queue and that's it.
*
* Now, if the new task added to the run-queue preempts the current
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
* called on the nearest possible occasion:
*
* - If the kernel is preemptible (CONFIG_PREEMPT=y):
*
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
* spin_unlock()!)
* - in IRQ context, return from interrupt-handler to
* preemptible context
*
* - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
* then at the next:
*
* - cond_resched() call
* - explicit schedule() call
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
*
* WARNING: must be called with preemption disabled!
*/
/* Excerpt: the three key steps of the core scheduling function. */
static void __sched notrace __schedule(bool preempt)
{
next = pick_next_task(rq, prev); /* choose the next task to run */
clear_tsk_need_resched(prev); /* clear the TIF_NEED_RESCHED flag */
rq = context_switch(rq, prev, next); /* switch to the chosen task */
}
pick_next_task會調用到CFS的成員:fair_sched_class.pick_next_task(rq, prev)選擇一個合适的程序執行。
用下圖總結性的看下程序的切換流程:
(5)程序的睡眠:
CFS 排程管理的程序在睡眠時會調用到 dequeue_task_fair:
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);//将排程實體se從對應的就緒隊列cfs_rq上删除
}
}
/*
 * Excerpt: remove one scheduling entity from a CFS runqueue.
 */
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
 * Update run-time statistics of the 'current'.
 */
update_curr(cfs_rq); /* refresh vruntime before removing the entity */
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se); /* remove from the red-black tree */
se->on_rq = 0; /* entity is no longer on a runqueue */
account_entity_dequeue(cfs_rq, se);
/* Unless the task is merely sleeping, make vruntime queue-relative so
 * it can migrate and be re-normalized against the target queue's
 * min_vruntime at enqueue time. */
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
}
作者:frank_zyp
您的支援是對部落客最大的鼓勵,感謝您的認真閱讀。
本文無所謂版權,歡迎轉載。