The CFS Fair Scheduling Algorithm (Source Code Walkthrough)
Kernel version: linux-4.2
Background
Process classification:
I/O-bound: runs only in short bursts and spends most of its time blocked waiting for I/O
CPU-bound (processor-bound): spends most of its time executing code, with little I/O
The scheduling policy has to strike a balance between these two kinds of processes: quick response (short response time) for the former and maximum system utilization (high throughput) for the latter.
Process priority:
The most basic scheduling algorithms rank processes according to their value and their need for processor time.
This is where the notions of the time slice and the nice value come from (the higher the nice value, the lower the priority).
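For orientation, the nice value is the knob user space actually turns. A minimal user-space sketch (illustrative only, not scheduler code) using nice(2) and getpriority(2):

/* Illustrative user-space sketch, not kernel code: a higher nice value
 * means a lower priority (and, as shown later, a smaller CFS weight). */
#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>

int main(void)
{
	int before = getpriority(PRIO_PROCESS, 0);	/* current nice value */

	nice(5);					/* raise nice by 5 -> lower priority */

	int after = getpriority(PRIO_PROCESS, 0);
	printf("nice value: %d -> %d\n", before, after);
	return 0;
}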
The CFS Algorithm
When process scheduling happens
(1) Process state transitions: the process terminates or goes to sleep
(2) The current process's "time slice" runs out
(3) The process voluntarily gives up the processor: user space calls sleep(), or kernel code calls schedule()
(4) On return from an interrupt, a system call, or an exception
CFS discards the notion of a time slice and instead gives each process a proportion of the processor.
The starting point is this: scheduling should behave as if the system had an ideal, perfectly multitasking processor on which each of n runnable processes receives 1/n of the processor time. In that ideal model, two processes would run truly simultaneously, each using half of the processor's capacity.
Real hardware cannot do that, so CFS lets each process run for a while and rotates through them, always choosing the process that has run the least as the next one to run. Rather than handing each process a time slice derived from its nice value, CFS computes how long a process should run from the total number of runnable processes.
vruntime: the process's virtual runtime
It is the run time weighted by priority and system load, not wall-clock time.
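To make the idea concrete before diving into the source, here is a minimal user-space sketch of the CFS principle (not kernel code; the three tasks, the 1 ms tick and the simple linear scan are made up for illustration, and the weights are the prio_to_weight[] values quoted later): each task's vruntime grows at a rate of nice-0 weight divided by its own weight, and the scheduler always runs the task with the smallest vruntime.

/* Toy user-space model of the CFS idea (not kernel code). */
#include <stdio.h>

#define NTASKS 3
#define NICE_0_WEIGHT 1024ULL

struct toy_task {
	const char *name;
	unsigned long long weight;	/* larger weight = higher priority */
	unsigned long long vruntime;	/* weighted virtual runtime, in ns */
};

static int pick_min_vruntime(struct toy_task *t, int n)
{
	int best = 0;
	for (int i = 1; i < n; i++)
		if (t[i].vruntime < t[best].vruntime)
			best = i;
	return best;
}

int main(void)
{
	struct toy_task tasks[NTASKS] = {
		{ "nice 0",  1024, 0 },	/* prio_to_weight[20] */
		{ "nice 5",   335, 0 },	/* prio_to_weight[25] */
		{ "nice -5", 3121, 0 },	/* prio_to_weight[15] */
	};
	unsigned long long tick = 1000000;	/* 1 ms of real time per "tick" */

	for (int step = 0; step < 12; step++) {
		int cur = pick_min_vruntime(tasks, NTASKS);
		/* vruntime grows more slowly for heavier (higher-priority) tasks */
		tasks[cur].vruntime += tick * NICE_0_WEIGHT / tasks[cur].weight;
		printf("step %2d: ran %-7s vruntime=%llu\n",
		       step, tasks[cur].name, tasks[cur].vruntime);
	}
	return 0;
}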
1. Basic data structures
Every process descriptor contains a scheduling entity, se.
include/linux/sched.h
struct task_struct {
	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	void *stack;
	atomic_t usage;
	unsigned int flags;	/* per process flags, defined below */
	unsigned int ptrace;
#ifdef CONFIG_SMP
	struct llist_node wake_entry;
	int on_cpu;
	struct task_struct *last_wakee;
	unsigned long wakee_flips;
	unsigned long wakee_flip_decay_ts;
	int wake_cpu;
#endif
	int on_rq;

	int prio, static_prio, normal_prio;
	unsigned int rt_priority;
	const struct sched_class *sched_class;
	struct sched_entity se;
	struct sched_rt_entity rt;
	/* ... (remaining members omitted) */
The se field holds the process's virtual runtime: it records how long the program has actually run and, with it, how much longer it should run.
struct sched_entity {
	struct load_weight	load;		/* for load-balancing */
	struct rb_node		run_node;
	struct list_head	group_node;
	unsigned int		on_rq;

	u64			exec_start;
	u64			sum_exec_runtime;
	u64			vruntime;
	u64			prev_sum_exec_runtime;

	u64			nr_migrations;

#ifdef CONFIG_SCHEDSTATS
	struct sched_statistics statistics;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	int			depth;
	struct sched_entity	*parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq		*cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq		*my_q;
#endif

#ifdef CONFIG_SMP
	/* Per-entity load-tracking */
	struct sched_avg	avg;
#endif
};
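As a quick way to see these fields from user space: with CONFIG_SCHED_DEBUG enabled, /proc/<pid>/sched exposes se.vruntime and se.sum_exec_runtime. A small illustrative reader (not kernel code):

/* Illustrative user-space sketch: print this process's vruntime-related
 * fields from /proc/self/sched (requires CONFIG_SCHED_DEBUG). */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/sched", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "vruntime") || strstr(line, "sum_exec_runtime"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}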
CFS maintains one run queue, cfs_rq, per CPU.
kernel/sched/sched.h
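An abridged sketch of the members relevant to this discussion (other fields, e.g. load tracking and group-scheduling members, are omitted):

struct cfs_rq {
	struct load_weight load;	/* total weight of all queued entities */
	unsigned int nr_running, h_nr_running;

	u64 min_vruntime;		/* monotonic floor of vruntime in this queue */

	struct rb_root tasks_timeline;	/* red-black tree of runnable entities, keyed by vruntime */
	struct rb_node *rb_leftmost;	/* cached leftmost (smallest-vruntime) node */

	/*
	 * 'curr' points to the currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e. when none are currently running).
	 */
	struct sched_entity *curr, *next, *last, *skip;
	/* ... */
};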
2. Updating vruntime
When the periodic timer interrupt fires, the call chain is tick_periodic() -> update_process_times() -> scheduler_tick().
kernel/time/tick-common.c
static void tick_periodic(int cpu)
{
	if (tick_do_timer_cpu == cpu) {
		write_seqlock(&jiffies_lock);

		/* Keep track of the next tick event */
		tick_next_period = ktime_add(tick_next_period, tick_period);

		do_timer(1);
		write_sequnlock(&jiffies_lock);
		update_wall_time();
	}

	update_process_times(user_mode(get_irq_regs()));
	profile_tick(CPU_PROFILING);
}

kernel/time/timer.c
/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;

	/* Note: this timer irq context must be accounted for as well. */
	account_process_tick(p, user_tick);
	run_local_timers();
	rcu_check_callbacks(user_tick);
#ifdef CONFIG_IRQ_WORK
	if (in_irq())
		irq_work_tick();
#endif
	scheduler_tick();
	run_posix_cpu_timers(p);
}

kernel/sched/core.c
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	curr->sched_class->task_tick(rq, curr, 0);	/* update the running task's vruntime via the class hook */
	update_cpu_load_active(rq);
	calc_global_load_tick(rq);
	raw_spin_unlock(&rq->lock);

	perf_event_task_tick();

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq);
#endif
	rq_last_tick_reset(rq);
}

kernel/sched/fair.c
In fair_sched_class, the tick hook is:
	.task_tick	= task_tick_fair,
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	/* walk up the entity hierarchy and update the vruntime at each level */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}

	if (numabalancing_enabled)
		task_tick_numa(rq, curr);

	update_rq_runnable_avg(rq, 1);
}
cfs_rq: the run queue of the CFS scheduling class
kernel/sched/fair.c
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	/*
	 * Ensure that runnable average is periodically updated.
	 */
	update_entity_load_avg(curr, 1);
	update_cfs_rq_blocked_load(cfs_rq, 1);
	update_cfs_shares(cfs_rq);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_curr(rq_of(cfs_rq));
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (cfs_rq->nr_running > 1)
		check_preempt_tick(cfs_rq, curr);	/* check whether the current task needs to be rescheduled */
}

/*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */
static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}

static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	/* hint to use a 32x32->64 mul */
	fact = (u64)(u32)fact * lw->inv_weight;

	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
static const int prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
static const u32 prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
Each nice level differs from the next by roughly 10% of CPU time.
Suppose two processes with nice 0 share a CPU; each should get 50%. If one of them is changed to nice 1, the higher-priority process should now get about 10% more CPU than the lower-priority one: 55% for the nice 0 process and 45% for the nice 1 process, i.e. a ratio of 55:45 ≈ 1.22. So the CPU-time ratio between adjacent nice levels should be roughly 1.25, which is exactly the step between adjacent entries of prio_to_weight[] (for example 1024/820 ≈ 1.25).
vruntime += delta_exec * nice_0_weight / weight

where delta_exec is the actual (wall-clock) run time, nice_0_weight is the weight of a nice 0 task (NICE_0_LOAD = 1024), and weight is the process's own weight, which can be looked up in prio_to_weight[].
CPU scheduling is something the kernel does extremely frequently, so this delta-vruntime computation runs constantly. Division costs far more CPU time than multiplication, and a rule of thumb in kernel programming is to avoid division whenever possible. The update is therefore rewritten as:
vruntime += delta_exec * nice_0_weight * (2^32 / weight) >> 32

Here 2^32 / weight can be precomputed as inv_weight, which is exactly what prio_to_wmult[] stores, so the update becomes:

vruntime += (delta_exec * nice_0_weight * inv_weight) >> 32
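A stand-alone sketch of this fixed-point arithmetic (not kernel code; the 10 ms delta_exec and the nice 5 example task are made-up inputs, and the real __calc_delta() additionally rescales the factor so the 64-bit product cannot overflow):

/* Stand-alone illustration of the shift-based weighting, using values
 * from the prio_to_weight[]/prio_to_wmult[] tables quoted above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta_exec    = 10000000;	/* 10 ms of real run time, in ns */
	uint64_t nice_0_weight = 1024;		/* prio_to_weight[20] */
	uint64_t inv_weight    = 12820798;	/* prio_to_wmult[25] = 2^32/335, i.e. nice 5 */

	/* delta_vruntime = delta_exec * NICE_0_LOAD * inv_weight >> 32 */
	uint64_t delta_fair = (delta_exec * nice_0_weight * inv_weight) >> 32;

	printf("delta_exec=%llu ns -> delta_vruntime=%llu ns\n",
	       (unsigned long long)delta_exec,
	       (unsigned long long)delta_fair);
	/* Prints roughly 30.57 ms (10 ms * 1024/335): a nice 5 task's vruntime
	 * advances ~3x faster than real time, so it gets picked less often. */
	return 0;
}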
Checking whether the current process needs to be rescheduled
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;
	struct sched_entity *se;
	s64 delta;

	ideal_runtime = sched_slice(cfs_rq, curr);	/* the slice this task is theoretically entitled to */
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;	/* time run in the current round */
	if (delta_exec > ideal_runtime) {		/* overran its slice: mark for rescheduling, otherwise keep running */
		resched_curr(rq_of(cfs_rq));
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (delta_exec < sysctl_sched_min_granularity)
		return;

	se = __pick_first_entity(cfs_rq);
	delta = curr->vruntime - se->vruntime;

	if (delta < 0)
		return;

	if (delta > ideal_runtime)
		resched_curr(rq_of(cfs_rq));
}
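For intuition, a simplified user-space model of where ideal_runtime comes from (assumptions: the common default tunables of roughly 6 ms latency, 0.75 ms minimum granularity and an 8-task latency window, before CPU-count scaling; the real sched_slice() also walks the group hierarchy and uses the __calc_delta() arithmetic shown earlier instead of a plain division):

/* Simplified model (not the kernel's exact code): every runnable task gets
 * a weight-proportional share of one scheduling period. */
#include <stdint.h>
#include <stdio.h>

#define SCHED_LATENCY_NS	6000000ULL	/* ~sysctl_sched_latency */
#define SCHED_MIN_GRANULARITY	750000ULL	/* ~sysctl_sched_min_granularity */
#define SCHED_NR_LATENCY	8		/* latency covers up to 8 tasks */

static uint64_t sched_period(unsigned int nr_running)
{
	/* Stretch the period once there are too many tasks to give each
	 * one at least the minimum granularity. */
	if (nr_running > SCHED_NR_LATENCY)
		return nr_running * SCHED_MIN_GRANULARITY;
	return SCHED_LATENCY_NS;
}

int main(void)
{
	unsigned int nr_running = 3;
	uint64_t se_weight = 1024;			/* this task: nice 0 */
	uint64_t rq_weight = 1024 + 335 + 3121;		/* sum of all runnable weights */

	uint64_t slice = sched_period(nr_running) * se_weight / rq_weight;
	printf("ideal slice ~= %llu ns\n", (unsigned long long)slice);
	return 0;
}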
3. How the next runnable process is chosen
CFS picks the process with the smallest vruntime as the next one to run. Runnable entities are stored in a red-black tree keyed by vruntime, so the leftmost node is the next process to run. Insertion and lookup are O(log n); to make picking even faster, the leftmost node is cached (rb_leftmost).
kernel/sched/core.c
__schedule() is the main scheduler function. The CFS-relevant call chain through it, abridged:

static void __sched __schedule(void)
{
	...
	next = pick_next_task(rq, prev);
	  -> p = fair_sched_class.pick_next_task(rq, prev), i.e. pick_next_task_fair(rq, prev)
	       -> se = pick_next_entity(cfs_rq, curr);
	            -> struct sched_entity *left = __pick_first_entity(cfs_rq);
	       -> put_prev_entity(cfs_rq, pse);	/* deal with the previous task: insert it back into the red-black tree */
	       -> set_next_entity(cfs_rq, se);	/* make se the running task: remove it from the red-black tree */
	...
	rq = context_switch(rq, prev, next);	/* unlocks the rq; eventually calls switch_to() */
	...
}

kernel/sched/fair.c
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}
Handling the previous process (the one that was just running):
put_prev_entity()
  -> __enqueue_entity()	/* insert it back into the red-black tree */
/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
At this point the process switch is complete.
4. How a new process gets its vruntime and joins the red-black tree
kernel/fork.c         _do_fork()
  copy_process()
kernel/sched/core.c   sched_fork()
  sched_class->task_fork()
kernel/sched/fair.c   task_fork_fair()
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	int this_cpu = smp_processor_id();
	struct rq *rq = this_rq();
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	update_rq_clock(rq);

	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;

	/*
	 * Not only the cpu but also the task_group of the parent might have
	 * been changed after parent->se.parent,cfs_rq were copied to
	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
	 * of child point to valid ones.
	 */
	rcu_read_lock();
	__set_task_cpu(p, this_cpu);
	rcu_read_unlock();

	update_curr(cfs_rq);

	if (curr)
		se->vruntime = curr->vruntime;
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);	/* swap parent and child vruntime */
		resched_curr(rq);
	}

	se->vruntime -= cfs_rq->min_vruntime;

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	/* sleeps up to a single latency don't count. */
	if (!initial) {
		unsigned long thresh = sysctl_sched_latency;

		/*
		 * Halve their sleep time's effect, to allow
		 * for a gentler effect of sleepers:
		 */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards. */
	se->vruntime = max_vruntime(se->vruntime, vruntime);
}
When a child process is created, its vruntime is first initialized from the parent, and place_entity() then makes sure it is no smaller than the queue's min_vruntime.
If START_DEBIT is set in sched_features, the vruntime is pushed a bit further past min_vruntime (by one virtual slice), so the new task starts slightly "in debt".
After the child's vruntime is set, the sched_child_runs_first parameter is checked: if it is 1, the parent's and child's vruntime are compared, and if the parent's is smaller the two are swapped, which guarantees the child runs before the parent.
Inserting the new task into the red-black tree:
wake_up_new_task()
kernel/sched/core.c   activate_task()
  enqueue_task()
kernel/sched/fair.c   enqueue_task_fair()
5. Sleeping processes
If a sleeping process kept its old vruntime while the running processes' vruntime kept advancing, then by the time it finally woke up its vruntime would be far smaller than everyone else's, handing it the CPU for a long stretch and starving the other processes.
So when a sleeping process is woken up, its vruntime is re-set relative to min_vruntime: it receives some compensation, but not too much.
In place_entity(), initial = 1 means a newly forked process.
initial = 0 means a woken-up process: a threshold thresh is subtracted from min_vruntime, which is the compensation granted to the sleeper. With the common default sysctl_sched_latency of about 6 ms and GENTLE_FAIR_SLEEPERS enabled, thresh is halved, so the woken task is placed roughly 3 ms of virtual time before min_vruntime (but never earlier than its own previous vruntime, thanks to max_vruntime()).
6. Migrating between CPUs
On a multi-CPU system the CPUs are not equally loaded; each CPU has its own run queue, and the vruntime values in different queues advance at different speeds.
So when a process is removed from one CPU's run queue (dequeue_entity()), the queue's min_vruntime is subtracted from its vruntime;
and when it is added to another CPU's run queue (enqueue_entity()), that queue's min_vruntime is added back, as the code below and the enqueue-side sketch after it show.
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);

	update_stats_dequeue(cfs_rq, se);
	if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize the entity after updating the min_vruntime because the
	 * update can refer to the ->curr item and we need to reflect this
	 * movement in our normalized position.
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);

	update_min_vruntime(cfs_rq);
	update_cfs_shares(cfs_rq);
}
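The enqueue side performs the mirror-image adjustment. An abridged sketch of the corresponding lines of enqueue_entity() in kernel/sched/fair.c, showing only the vruntime handling (statistics, load tracking, place_entity() for wakeups and the __enqueue_entity() call are elided):

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update the normalized vruntime before updating min_vruntime
	 * through calling update_curr().
	 */
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

	update_curr(cfs_rq);
	/* ... statistics, load tracking, place_entity() for wakeups,
	 *     and __enqueue_entity() into the red-black tree ... */
}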
