The CFS Fair Scheduling Algorithm (Source Code Walkthrough)
Kernel version: linux-4.2
Background
Process classification:
I/O-bound: runs only in short bursts and spends most of its time blocked waiting for I/O
CPU-bound (processor-bound): spends most of its time executing code, with little I/O
The scheduling policy has to strike a balance between these two kinds of processes: quick response (short response time) for the former and maximum system utilization (high throughput) for the latter.
Process priority:
The most basic scheduling algorithms rank processes according to their value and their need for processor time.
This is where the notions of the time slice and the nice value come from (the higher the nice value, the lower the priority).
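For orientation, the nice value is the knob user space actually turns. A minimal user-space sketch (illustrative only, not scheduler code) using nice(2) and getpriority(2):

/* Illustrative user-space sketch, not kernel code: a higher nice value
 * means a lower priority (and, as shown later, a smaller CFS weight). */
#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>

int main(void)
{
	int before = getpriority(PRIO_PROCESS, 0);	/* current nice value */

	nice(5);					/* raise nice by 5 -> lower priority */

	int after = getpriority(PRIO_PROCESS, 0);
	printf("nice value: %d -> %d\n", before, after);
	return 0;
}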
The CFS Algorithm
When process scheduling happens
(1) Process state transitions: the process terminates or goes to sleep
(2) The current process's "time slice" runs out
(3) The process voluntarily gives up the processor: user space calls sleep(), or kernel code calls schedule()
(4) On return from an interrupt, a system call, or an exception
CFS discards the notion of a time slice and instead gives each process a proportion of the processor.
The starting point is this: scheduling should behave as if the system had an ideal, perfectly multitasking processor on which each of n runnable processes receives 1/n of the processor time. In that ideal model, two processes would run truly simultaneously, each using half of the processor's capacity.
Real hardware cannot do that, so CFS lets each process run for a while and rotates through them, always choosing the process that has run the least as the next one to run. Rather than handing each process a time slice derived from its nice value, CFS computes how long a process should run from the total number of runnable processes.
vruntime: the process's virtual runtime
It is the run time weighted by priority and system load, not wall-clock time.
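To make the idea concrete before diving into the source, here is a minimal user-space sketch of the CFS principle (not kernel code; the three tasks, the 1 ms tick and the simple linear scan are made up for illustration, and the weights are the prio_to_weight[] values quoted later): each task's vruntime grows at a rate of nice-0 weight divided by its own weight, and the scheduler always runs the task with the smallest vruntime.

/* Toy user-space model of the CFS idea (not kernel code). */
#include <stdio.h>

#define NTASKS 3
#define NICE_0_WEIGHT 1024ULL

struct toy_task {
	const char *name;
	unsigned long long weight;	/* larger weight = higher priority */
	unsigned long long vruntime;	/* weighted virtual runtime, in ns */
};

static int pick_min_vruntime(struct toy_task *t, int n)
{
	int best = 0;
	for (int i = 1; i < n; i++)
		if (t[i].vruntime < t[best].vruntime)
			best = i;
	return best;
}

int main(void)
{
	struct toy_task tasks[NTASKS] = {
		{ "nice 0",  1024, 0 },	/* prio_to_weight[20] */
		{ "nice 5",   335, 0 },	/* prio_to_weight[25] */
		{ "nice -5", 3121, 0 },	/* prio_to_weight[15] */
	};
	unsigned long long tick = 1000000;	/* 1 ms of real time per "tick" */

	for (int step = 0; step < 12; step++) {
		int cur = pick_min_vruntime(tasks, NTASKS);
		/* vruntime grows more slowly for heavier (higher-priority) tasks */
		tasks[cur].vruntime += tick * NICE_0_WEIGHT / tasks[cur].weight;
		printf("step %2d: ran %-7s vruntime=%llu\n",
		       step, tasks[cur].name, tasks[cur].vruntime);
	}
	return 0;
}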
1. Basic data structures
Every process descriptor contains a scheduling entity, se.
include/linux/sched.h
struct task_struct {
	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	void *stack;
	atomic_t usage;
	unsigned int flags;	/* per process flags, defined below */
	unsigned int ptrace;
#ifdef CONFIG_SMP
	struct llist_node wake_entry;
	int on_cpu;
	struct task_struct *last_wakee;
	unsigned long wakee_flips;
	unsigned long wakee_flip_decay_ts;
	int wake_cpu;
#endif
	int on_rq;

	int prio, static_prio, normal_prio;
	unsigned int rt_priority;
	const struct sched_class *sched_class;
	struct sched_entity se;
	struct sched_rt_entity rt;
	/* ... (remaining members omitted) */
The se field holds the process's virtual runtime: it records how long the program has actually run and, with it, how much longer it should run.
struct sched_entity {
	struct load_weight	load;		/* for load-balancing */
	struct rb_node		run_node;
	struct list_head	group_node;
	unsigned int		on_rq;

	u64			exec_start;
	u64			sum_exec_runtime;
	u64			vruntime;
	u64			prev_sum_exec_runtime;

	u64			nr_migrations;

#ifdef CONFIG_SCHEDSTATS
	struct sched_statistics statistics;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	int			depth;
	struct sched_entity	*parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq		*cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq		*my_q;
#endif

#ifdef CONFIG_SMP
	/* Per-entity load-tracking */
	struct sched_avg	avg;
#endif
};
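As a quick way to see these fields from user space: with CONFIG_SCHED_DEBUG enabled, /proc/<pid>/sched exposes se.vruntime and se.sum_exec_runtime. A small illustrative reader (not kernel code):

/* Illustrative user-space sketch: print this process's vruntime-related
 * fields from /proc/self/sched (requires CONFIG_SCHED_DEBUG). */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/sched", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "vruntime") || strstr(line, "sum_exec_runtime"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}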
CFS maintains one run queue, cfs_rq, per CPU.
kernel/sched/sched.h
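An abridged sketch of the members relevant to this discussion (other fields, e.g. load tracking and group-scheduling members, are omitted):

struct cfs_rq {
	struct load_weight load;	/* total weight of all queued entities */
	unsigned int nr_running, h_nr_running;

	u64 min_vruntime;		/* monotonic floor of vruntime in this queue */

	struct rb_root tasks_timeline;	/* red-black tree of runnable entities, keyed by vruntime */
	struct rb_node *rb_leftmost;	/* cached leftmost (smallest-vruntime) node */

	/*
	 * 'curr' points to the currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e. when none are currently running).
	 */
	struct sched_entity *curr, *next, *last, *skip;
	/* ... */
};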
2. Updating vruntime
When the periodic timer interrupt fires, the call chain is tick_periodic() -> update_process_times() -> scheduler_tick().
kernel/time/tick-common.c
static void tick_periodic(int cpu)
{
	if (tick_do_timer_cpu == cpu) {
		write_seqlock(&jiffies_lock);

		/* Keep track of the next tick event */
		tick_next_period = ktime_add(tick_next_period, tick_period);

		do_timer(1);
		write_sequnlock(&jiffies_lock);
		update_wall_time();
	}

	update_process_times(user_mode(get_irq_regs()));
	profile_tick(CPU_PROFILING);
}

kernel/time/timer.c
/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;

	/* Note: this timer irq context must be accounted for as well. */
	account_process_tick(p, user_tick);
	run_local_timers();
	rcu_check_callbacks(user_tick);
#ifdef CONFIG_IRQ_WORK
	if (in_irq())
		irq_work_tick();
#endif
	scheduler_tick();
	run_posix_cpu_timers(p);
}

kernel/sched/core.c
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	curr->sched_class->task_tick(rq, curr, 0);	/* update the running task's vruntime via the class hook */
	update_cpu_load_active(rq);
	calc_global_load_tick(rq);
	raw_spin_unlock(&rq->lock);

	perf_event_task_tick();

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq);
#endif
	rq_last_tick_reset(rq);
}

kernel/sched/fair.c
In fair_sched_class, the tick hook is:
	.task_tick	= task_tick_fair,
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	/* walk up the entity hierarchy and update the vruntime at each level */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se, queued);
	}

	if (numabalancing_enabled)
		task_tick_numa(rq, curr);

	update_rq_runnable_avg(rq, 1);
}
cfs_rq: the run queue of the CFS scheduling class
kernel/sched/fair.c
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	/*
	 * Ensure that runnable average is periodically updated.
	 */
	update_entity_load_avg(curr, 1);
	update_cfs_rq_blocked_load(cfs_rq, 1);
	update_cfs_shares(cfs_rq);

#ifdef CONFIG_SCHED_HRTICK
	/*
	 * queued ticks are scheduled to match the slice, so don't bother
	 * validating it and just reschedule.
	 */
	if (queued) {
		resched_curr(rq_of(cfs_rq));
		return;
	}
	/*
	 * don't let the period tick interfere with the hrtick preemption
	 */
	if (!sched_feat(DOUBLE_TICK) &&
			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
		return;
#endif

	if (cfs_rq->nr_running > 1)
		check_preempt_tick(cfs_rq, curr);	/* check whether the current task needs to be rescheduled */
}

/*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */
static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}

static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	/* hint to use a 32x32->64 mul */
	fact = (u64)(u32)fact * lw->inv_weight;

	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
static const int prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

/*
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
static const u32 prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
Each nice level differs from the next by roughly 10% of CPU time.
Suppose two processes with nice 0 share a CPU; each should get 50%. If one of them is changed to nice 1, the higher-priority process should now get about 10% more CPU than the lower-priority one: 55% for the nice 0 process and 45% for the nice 1 process, i.e. a ratio of 55:45 ≈ 1.22. So the CPU-time ratio between adjacent nice levels should be roughly 1.25, which is exactly the step between adjacent entries of prio_to_weight[] (for example 1024/820 ≈ 1.25).
vruntime += delta_exec * nice_0_weight / weight

where delta_exec is the actual (wall-clock) run time, nice_0_weight is the weight of a nice 0 task (NICE_0_LOAD = 1024), and weight is the process's own weight, which can be looked up in prio_to_weight[].
CPU scheduling is something the kernel does extremely frequently, so this delta-vruntime computation runs constantly. Division costs far more CPU time than multiplication, and a rule of thumb in kernel programming is to avoid division whenever possible. The update is therefore rewritten as:
vruntime += delta_exec * nice_0_weight * (2^32 / weight) >> 32

Here 2^32 / weight can be precomputed as inv_weight, which is exactly what prio_to_wmult[] stores, so the update becomes:

vruntime += (delta_exec * nice_0_weight * inv_weight) >> 32
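A stand-alone sketch of this fixed-point arithmetic (not kernel code; the 10 ms delta_exec and the nice 5 example task are made-up inputs, and the real __calc_delta() additionally rescales the factor so the 64-bit product cannot overflow):

/* Stand-alone illustration of the shift-based weighting, using values
 * from the prio_to_weight[]/prio_to_wmult[] tables quoted above. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta_exec    = 10000000;	/* 10 ms of real run time, in ns */
	uint64_t nice_0_weight = 1024;		/* prio_to_weight[20] */
	uint64_t inv_weight    = 12820798;	/* prio_to_wmult[25] = 2^32/335, i.e. nice 5 */

	/* delta_vruntime = delta_exec * NICE_0_LOAD * inv_weight >> 32 */
	uint64_t delta_fair = (delta_exec * nice_0_weight * inv_weight) >> 32;

	printf("delta_exec=%llu ns -> delta_vruntime=%llu ns\n",
	       (unsigned long long)delta_exec,
	       (unsigned long long)delta_fair);
	/* Prints roughly 30.57 ms (10 ms * 1024/335): a nice 5 task's vruntime
	 * advances ~3x faster than real time, so it gets picked less often. */
	return 0;
}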
Checking whether the current process needs to be rescheduled
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;
	struct sched_entity *se;
	s64 delta;

	ideal_runtime = sched_slice(cfs_rq, curr);	/* the slice this task is theoretically entitled to */
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;	/* time run in the current round */
	if (delta_exec > ideal_runtime) {		/* overran its slice: mark for rescheduling, otherwise keep running */
		resched_curr(rq_of(cfs_rq));
		/*
		 * The current task ran long enough, ensure it doesn't get
		 * re-elected due to buddy favours.
		 */
		clear_buddies(cfs_rq, curr);
		return;
	}

	/*
	 * Ensure that a task that missed wakeup preemption by a
	 * narrow margin doesn't have to wait for a full slice.
	 * This also mitigates buddy induced latencies under load.
	 */
	if (delta_exec < sysctl_sched_min_granularity)
		return;

	se = __pick_first_entity(cfs_rq);
	delta = curr->vruntime - se->vruntime;

	if (delta < 0)
		return;

	if (delta > ideal_runtime)
		resched_curr(rq_of(cfs_rq));
}
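For intuition, a simplified user-space model of where ideal_runtime comes from (assumptions: the common default tunables of roughly 6 ms latency, 0.75 ms minimum granularity and an 8-task latency window, before CPU-count scaling; the real sched_slice() also walks the group hierarchy and uses the __calc_delta() arithmetic shown earlier instead of a plain division):

/* Simplified model (not the kernel's exact code): every runnable task gets
 * a weight-proportional share of one scheduling period. */
#include <stdint.h>
#include <stdio.h>

#define SCHED_LATENCY_NS	6000000ULL	/* ~sysctl_sched_latency */
#define SCHED_MIN_GRANULARITY	750000ULL	/* ~sysctl_sched_min_granularity */
#define SCHED_NR_LATENCY	8		/* latency covers up to 8 tasks */

static uint64_t sched_period(unsigned int nr_running)
{
	/* Stretch the period once there are too many tasks to give each
	 * one at least the minimum granularity. */
	if (nr_running > SCHED_NR_LATENCY)
		return nr_running * SCHED_MIN_GRANULARITY;
	return SCHED_LATENCY_NS;
}

int main(void)
{
	unsigned int nr_running = 3;
	uint64_t se_weight = 1024;			/* this task: nice 0 */
	uint64_t rq_weight = 1024 + 335 + 3121;		/* sum of all runnable weights */

	uint64_t slice = sched_period(nr_running) * se_weight / rq_weight;
	printf("ideal slice ~= %llu ns\n", (unsigned long long)slice);
	return 0;
}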
3. How the next runnable process is chosen
CFS picks the process with the smallest vruntime as the next one to run. Runnable entities are stored in a red-black tree keyed by vruntime, so the leftmost node is the next process to run. Insertion and lookup are O(log n); to make picking even faster, the leftmost node is cached (rb_leftmost).
kernel/sched/core.c
__schedule() is the main scheduler function. The CFS-relevant call chain through it, abridged:

static void __sched __schedule(void)
{
	...
	next = pick_next_task(rq, prev);
	  -> p = fair_sched_class.pick_next_task(rq, prev), i.e. pick_next_task_fair(rq, prev)
	       -> se = pick_next_entity(cfs_rq, curr);
	            -> struct sched_entity *left = __pick_first_entity(cfs_rq);
	       -> put_prev_entity(cfs_rq, pse);	/* deal with the previous task: insert it back into the red-black tree */
	       -> set_next_entity(cfs_rq, se);	/* make se the running task: remove it from the red-black tree */
	...
	rq = context_switch(rq, prev, next);	/* unlocks the rq; eventually calls switch_to() */
	...
}

kernel/sched/fair.c
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}
Handling the previous process (the one that was just running):
put_prev_entity()
  -> __enqueue_entity()	/* insert it back into the red-black tree */
/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
At this point the process switch is complete.
4. How a new process gets its vruntime and joins the red-black tree
kernel/fork.c         _do_fork()
  copy_process()
kernel/sched/core.c   sched_fork()
  sched_class->task_fork()
kernel/sched/fair.c   task_fork_fair()
static void task_fork_fair(struct task_struct *p)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se, *curr;
	int this_cpu = smp_processor_id();
	struct rq *rq = this_rq();
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);

	update_rq_clock(rq);

	cfs_rq = task_cfs_rq(current);
	curr = cfs_rq->curr;

	/*
	 * Not only the cpu but also the task_group of the parent might have
	 * been changed after parent->se.parent,cfs_rq were copied to
	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
	 * of child point to valid ones.
	 */
	rcu_read_lock();
	__set_task_cpu(p, this_cpu);
	rcu_read_unlock();

	update_curr(cfs_rq);

	if (curr)
		se->vruntime = curr->vruntime;
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);	/* swap parent and child vruntime */
		resched_curr(rq);
	}

	se->vruntime -= cfs_rq->min_vruntime;

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * The 'current' period is already promised to the current tasks,
	 * however the extra weight of the new task will slow them down a
	 * little, place the new task so that it fits in the slot that
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
		vruntime += sched_vslice(cfs_rq, se);

	/* sleeps up to a single latency don't count. */
	if (!initial) {
		unsigned long thresh = sysctl_sched_latency;

		/*
		 * Halve their sleep time's effect, to allow
		 * for a gentler effect of sleepers:
		 */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			thresh >>= 1;

		vruntime -= thresh;
	}

	/* ensure we never gain time by being placed backwards. */
	se->vruntime = max_vruntime(se->vruntime, vruntime);
}
When a child process is created, its vruntime is first initialized from the parent, and place_entity() then makes sure it is no smaller than the queue's min_vruntime.
If START_DEBIT is set in sched_features, the vruntime is pushed a bit further past min_vruntime (by one virtual slice), so the new task starts slightly "in debt".
After the child's vruntime is set, the sched_child_runs_first parameter is checked: if it is 1, the parent's and child's vruntime are compared, and if the parent's is smaller the two are swapped, which guarantees the child runs before the parent.
Inserting the new task into the red-black tree:
wake_up_new_task()
kernel/sched/core.c   activate_task()
  enqueue_task()
kernel/sched/fair.c   enqueue_task_fair()
5. Sleeping processes
If a sleeping process kept its old vruntime while the running processes' vruntime kept advancing, then by the time it finally woke up its vruntime would be far smaller than everyone else's, handing it the CPU for a long stretch and starving the other processes.
So when a sleeping process is woken up, its vruntime is re-set relative to min_vruntime: it receives some compensation, but not too much.
In place_entity(), initial = 1 means a newly forked process.
initial = 0 means a woken-up process: a threshold thresh is subtracted from min_vruntime, which is the compensation granted to the sleeper. With the common default sysctl_sched_latency of about 6 ms and GENTLE_FAIR_SLEEPERS enabled, thresh is halved, so the woken task is placed roughly 3 ms of virtual time before min_vruntime (but never earlier than its own previous vruntime, thanks to max_vruntime()).
6. Migrating between CPUs
On a multi-CPU system the CPUs are not equally loaded; each CPU has its own run queue, and the vruntime values in different queues advance at different speeds.
So when a process is removed from one CPU's run queue (dequeue_entity()), the queue's min_vruntime is subtracted from its vruntime;
and when it is added to another CPU's run queue (enqueue_entity()), that queue's min_vruntime is added back, as the code below and the enqueue-side sketch after it show.
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);

	update_stats_dequeue(cfs_rq, se);
	if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize the entity after updating the min_vruntime because the
	 * update can refer to the ->curr item and we need to reflect this
	 * movement in our normalized position.
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);

	update_min_vruntime(cfs_rq);
	update_cfs_shares(cfs_rq);
}
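The enqueue side performs the mirror-image adjustment. An abridged sketch of the corresponding lines of enqueue_entity() in kernel/sched/fair.c, showing only the vruntime handling (statistics, load tracking, place_entity() for wakeups and the __enqueue_entity() call are elided):

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update the normalized vruntime before updating min_vruntime
	 * through calling update_curr().
	 */
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

	update_curr(cfs_rq);
	/* ... statistics, load tracking, place_entity() for wakeups,
	 *     and __enqueue_entity() into the red-black tree ... */
}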
