Linux内核源代码情景分析-wait()、schedule()

    父进程执行wait4,并调用schedule切换到子进程:

    wait4(child, NULL, 0, NULL);

    像其他系统调用一样,wait4()在内核中的入口是sys_wait4(),代码如下:

/*
 * sys_wait4 - kernel entry point of the wait4() system call.
 *
 * @pid:       selects which children to wait for:
 *               > 0   only the child whose pid equals @pid,
 *               == 0  any child in the caller's process group,
 *               == -1 any child,
 *               < -1  any child whose process group is -@pid.
 * @stat_addr: if non-NULL, user-space address that receives the status word.
 * @options:   bit mask of WNOHANG, WUNTRACED, __WNOTHREAD, __WCLONE, __WALL.
 * @ru:        if non-NULL, filled in through getrusage() for the reaped child.
 *
 * Returns the pid of the child that was reported, 0 when WNOHANG is set and
 * no eligible child has changed state yet, -EINVAL for unknown option bits,
 * -ECHILD when no child matches the selection, -ERESTARTSYS when a signal is
 * pending, or the error returned by getrusage()/put_user().
 */
asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru)
{
	int flag, retval;
	DECLARE_WAITQUEUE(wait, current);
	struct task_struct *tsk;

	if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL))
		return -EINVAL;

	add_wait_queue(&current->wait_chldexit,&wait);
repeat:
	flag = 0;
	/* The parent puts itself into the interruptible sleep state. */
	current->state = TASK_INTERRUPTIBLE;
	read_lock(&tasklist_lock);
	tsk = current;
	do {	/* outer loop: every thread of the caller's thread group */
		struct task_struct *p;

		/*
		 * Inner loop: start at the youngest child and follow the
		 * p_osptr chain, looking for a child that matches @pid or
		 * the other selection criteria.
		 */
		for (p = tsk->p_cptr ; p ; p = p->p_osptr) {
			if (pid>0) {
				/* Skip children whose pid does not match. */
				if (p->pid != pid)
					continue;
			} else if (!pid) {
				if (p->pgrp != current->pgrp)
					continue;
			} else if (pid != -1) {
				if (p->pgrp != -pid)
					continue;
			}
			/* Wait for all children (clone and not) if __WALL is set;
			 * otherwise, wait for clone children *only* if __WCLONE is
			 * set; otherwise, wait for non-clone children *only*.  (Note:
			 * A "clone" child here is one that reports to its parent
			 * using a signal other than SIGCHLD.) */
			if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
			    && !(options & __WALL))
				continue;
			/* At least one child satisfies the selection criteria. */
			flag = 1;
			switch (p->state) {
			case TASK_STOPPED:
				if (!p->exit_code)
					continue;
				if (!(options & WUNTRACED) && !(p->ptrace & PT_PTRACED))
					continue;
				read_unlock(&tasklist_lock);
				retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
				if (!retval && stat_addr)
					retval = put_user((p->exit_code << 8) | 0x7f, stat_addr);
				if (!retval) {
					p->exit_code = 0;
					retval = p->pid;
				}
				/* The child is stopped: report it and leave. */
				goto end_wait4;
			case TASK_ZOMBIE:
				/* Fold the child's CPU times into the parent's totals. */
				current->times.tms_cutime += p->times.tms_utime + p->times.tms_cutime;
				current->times.tms_cstime += p->times.tms_stime + p->times.tms_cstime;
				read_unlock(&tasklist_lock);
				retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
				if (!retval && stat_addr)
					retval = put_user(p->exit_code, stat_addr);
				if (retval)
					goto end_wait4;
				retval = p->pid;
				if (p->p_opptr != p->p_pptr) {
					/*
					 * The current parent is not the original one:
					 * relink the child under its original parent
					 * and notify that parent with SIGCHLD.
					 */
					write_lock_irq(&tasklist_lock);
					REMOVE_LINKS(p);
					p->p_pptr = p->p_opptr;
					SET_LINKS(p);
					do_notify_parent(p, SIGCHLD);
					write_unlock_irq(&tasklist_lock);
				} else
					/* Free the child's task_struct and kernel stack. */
					release_task(p);
				/* The child is a zombie: it has been handled, leave. */
				goto end_wait4;
			default:
				/* Any other state: keep scanning the children. */
				continue;
			}
		}
		if (options & __WNOTHREAD)
			break;
		/* Next thread's task_struct in the same thread_group list. */
		tsk = next_thread(tsk);
	} while (tsk != current);
	read_unlock(&tasklist_lock);
	if (flag) {
		/*
		 * A matching child exists but is neither stopped nor a zombie
		 * yet: return 0 for WNOHANG, bail out if a signal is pending,
		 * otherwise sleep and rescan after being woken up.
		 */
		retval = 0;
		if (options & WNOHANG)
			goto end_wait4;
		retval = -ERESTARTSYS;
		if (signal_pending(current))
			goto end_wait4;
		schedule();
		goto repeat;
	}
	/* No child matched the selection at all. */
	retval = -ECHILD;
end_wait4:
	current->state = TASK_RUNNING;
	remove_wait_queue(&current->wait_chldexit,&wait);
	return retval;
}

    下列条件之一得到满足时才结束,goto end_wait4:

    1、所等待的子进程的状态变成TASK_STOPPED,TASK_ZOMBIE;

    2、所等待的子进程存在,可不在上述两个状态,而调用参数options中的WNOHANG标志位为1,或者当前进程接收到了其他的信号;

    3、进程号pid的那个进程根本不存在,或者不是当前进程的子进程。

    否则,当前进程将其自身的状态设成TASK_INTERRUPTIBLE,并调用schedule()。


    schedule,代码如下:

asmlinkage void schedule(void)
{struct schedule_data * sched_data;struct task_struct *prev, *next, *p;struct list_head *tmp;int this_cpu, c;if (!current->active_mm) BUG();//如果当前进程是个内核线程,那就没有用户空间,所以其mm指针为0,运行时就要暂时借用在它之前运行的那个进程的active_mm,所以active_mm一定不等于0
need_resched_back:prev = current;//当前进程赋值给prevthis_cpu = prev->processor;if (in_interrupt())//只能由进程在内核中主动调用,或者在当前进程从系统空间返回用户空间的前夕被动地发生,而不能在一个中断服务程序的内部发生goto scheduling_in_interrupt;release_kernel_lock(prev, this_cpu);/* Do "administrative" work here while we don't hold any locks */if (softirq_active(this_cpu) & softirq_mask(this_cpu))//处理软中断goto handle_softirq;
handle_softirq_back:/** 'sched_data' is protected by the fact that we can run* only one process per CPU.*/sched_data = & aligned_data[this_cpu].schedule_data;spin_lock_irq(&runqueue_lock);/* move an exhausted RR process to be last.. */if (prev->policy == SCHED_RR)//见注释1goto move_rr_last;
move_rr_back:switch (prev->state) {case TASK_INTERRUPTIBLE://TASK_UNINTERRUPTIBLE和TASK_INTERRUPTIBLE的主要区别就在于此,TASK_UNINTERRUPTIBLE即使有信号等待处理,也不将其修改成TASK_RUNNINGif (signal_pending(prev)) {//有信号等待处理时要将其改成TASK_RUNNINGprev->state = TASK_RUNNING;break;}default:del_from_runqueue(prev);//sys_wait4中调用schedule时的状态为TASK_INTERRUPTIBLE,所以这里把这进程从可执行队列中撤下来case TASK_RUNNING://如果是TASK_RUNNING,即继续运行,那么这里不需要有什么特殊处理}prev->need_resched = 0;//刚开始need_reshced清0/** this is the scheduler proper:*/repeat_schedule:/** Default process to select..*/next = idle_task(this_cpu);//目前是进程0,指向已知最佳的候选进程c = -1000;//目前是最低的权值,指向这个进程的综合权值if (prev->state == TASK_RUNNING)//如果当前进程想要继续运行goto still_running;still_running_back:list_for_each(tmp, &runqueue_head) {//遍历可执行队列runqueue中的每个进程p = list_entry(tmp, struct task_struct, run_list);if (can_schedule(p, this_cpu)) {//单cpu中can_schedule永远为1int weight = goodness(p, this_cpu, prev->active_mm);//进程所具有的权值if (weight > c)//挑选出权值最大的c = weight, next = p;}}/* Do we need to re-calculate counters? */if (!c)//如果当前已经选择的进程(权值最高的进程)权值为0,那么就要重新计算各个进程的时间配额,参考注释2goto recalculate;/** from this point on nothing can prevent us from* switching to the next task, save this fact in* sched_data.*/sched_data->curr = next;......spin_unlock_irq(&runqueue_lock);if (prev == next)//挑选出来的next就是当前进程goto same_process;......kstat.context_swtch++;/** there are 3 processes which are affected by a context switch:** prev == .... 
==> (last => next)** It's the 'much more previous' 'prev' that is on next's stack,* but prev is set to (the just run) 'last' process by switch_to().* This might sound slightly confusing but makes tons of sense.*/prepare_to_switch();//空语句{struct mm_struct *mm = next->mm;struct mm_struct *oldmm = prev->active_mm;if (!mm) {//内核线程if (next->active_mm) BUG();next->active_mm = oldmm;//借用一个mm_structatomic_inc(&oldmm->mm_count);enter_lazy_tlb(oldmm, next, this_cpu);} else {if (next->active_mm != mm) BUG();switch_mm(oldmm, mm, next, this_cpu);//用户空间的切换}if (!prev->mm) {//归还刚刚借用的mm_structprev->active_mm = NULL;mmdrop(oldmm);}}/** This just switches the register state and the* stack.*/switch_to(prev, next, prev);//到了最后要切换进程的关头了。所谓进程的切换主要是堆栈的切换__schedule_tail(prev);//将当前进程prev的task_struct结构中policy字段里的SCHED_YIELD标志位清成0same_process:reacquire_kernel_lock(current);if (current->need_resched)//前面已经把当前进程的need_resched清0,如果现在又成了非0,则一定发生了中断并且情况发生了变化goto need_resched_back;return;recalculate:{struct task_struct *p;spin_unlock_irq(&runqueue_lock);read_lock(&tasklist_lock);for_each_task(p)//对所有进程的循环,对不在runqueue的进程,也提升其时间配额,参考注释3p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);read_unlock(&tasklist_lock);spin_lock_irq(&runqueue_lock);}goto repeat_schedule;still_running:c = goodness(prev, this_cpu, prev->active_mm);//那么挑选候选进程时以当前进程此刻的权值开始。这意味着,相对于权值相同的其它进程来说,当前进程优先next = prev;goto still_running_back;handle_softirq:do_softirq();goto handle_softirq_back;move_rr_last:if (!prev->counter) {//如果时间配额用完了prev->counter = NICE_TO_TICKS(prev->nice);move_last_runqueue(prev);//从可执行进程队列runqueue中当前的位置上移到队列的末尾,同时恢复其最初的时间配额,对于相同优先级的进程,调度的时候排在前面的进程优先,所以这使队列中具有相同优先级的其它进程有了优势}goto move_rr_back;scheduling_in_interrupt:printk("Scheduling in interrupt\n");BUG();return;
}
    注释1:

    为了适应各种不同应用的需要,内核在此基础上实现了三种不同的政策:SCHED_FIFO、SCHED_RR以及SCHED_OTHER。每个进程都有自己使用的调度政策,并且进程还可以通过系统调用sched_setscheduler()设定自己使用的调度政策。其中SCHED_FIFO适合于时间性要求比较强(要立刻执行这个进程)、但每次运行所需的时间比较短的进程,实时的应用大都具有这样的特点。SCHED_RR中的“RR”表示“Round Robin”,是轮流的意思,这种政策适合比较大、也就是每次运行需时较长的进程。而除此二者之外的SCHED_OTHER,则为传统的调度政策,比较适合于交互式的分时应用。

    当前进程prev的调度政策为SCHED_RR,即轮换调度。SCHED_RR和SCHED_FIFO都是基于优先级的调度政策,可是在怎样调度具有相同优先级的进程这个问题上二者有区别。调度策略为SCHED_FIFO的进程一旦受到调度而开始运行之后,就要一直运行到自愿让出或被优先级更高的进程剥夺为止。对于每次受到调度时要求运行时间不长的进程,这样并没有什么不妥。可是,如果是受到调度后可能会长时间运行的进程,那样就不公平了。这种不公正性是对具有相同优先级的进程而言。所以,对这样的进程应该实行SCHED_RR调度政策,这种政策在相同的优先级上实行轮换调度。

 

    注释2:

    此时所有runqueue的进程权值都为0,由于除init_task(即0号idle进程)和调用了sched_yield()的进程以外,每个进程的权值最低为0,所以只要队列中有其他就绪进程存在就不可能为负数。这里要指出,队列中所有其他进程的权值都已降到0,说明这些进程的调度政策都是SCHED_OTHER,因为若有政策为SCHED_FIFO或SCHED_RR的进程存在,则权值至少也有1000。


    注释3:

    for_each_task()是对所有进程的循环,而不是仅对就绪进程队列的循环。对于不在就绪进程队列中的非实时进程,这里得到了提升其时间配额、从而提升其综合权值的机会。不过,对综合权值的这种提升是很有限的,每次重新计算都将原有的时间配额减半,再与NICE_TO_TICKS(p->nice)相加,这样就决定了重新计算以后的综合权值永远也不可能达到NICE_TO_TICKS(p->nice)的两倍。因此,即使经过很长时间的"韬光养晦",也不能达到可与实时进程竞争的地步(综合权值至少是1000),所以只是对非实时进程之间的竞争有意义。至于实时进程,时间配额的增加并不会提升其综合权值,而且对于SCHED_FIFO进程则连时间配额也是没有意义的。


    goodness,计算进程所具有的综合权值:

/*
 * goodness - compute the overall scheduling weight of a process.
 *
 * @p:        the candidate process.
 * @this_cpu: the CPU the scheduler is running on.
 * @this_mm:  the mm the caller is currently using.
 *
 * Returns -1 when SCHED_YIELD is set in p->policy, 0 for a SCHED_OTHER
 * process whose counter is 0, counter-based weight plus bonuses for other
 * SCHED_OTHER processes, and 1000 + p->rt_priority for every other policy.
 */
static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
{
	int weight;

	/*
	 * select the current process after every other
	 * runnable process, but before the idle thread.
	 * Also, dont trigger a counter recalculation.
	 */
	weight = -1;
	if (p->policy & SCHED_YIELD)
		goto out;

	/*
	 * Non-RT process - normal case first.
	 */
	if (p->policy == SCHED_OTHER) {
		/*
		 * Give the process a first-approximation goodness value
		 * according to the number of clock-ticks it has left.
		 *
		 * Don't do any other calculations if the time slice is
		 * over..
		 */
		weight = p->counter;
		if (!weight)
			goto out;

#ifdef CONFIG_SMP
		/* Give a largish advantage to the same processor...   */
		/* (this is equivalent to penalizing other processors) */
		if (p->processor == this_cpu)
			weight += PROC_CHANGE_PENALTY;
#endif

		/* .. and a slight advantage to the current MM */
		/*
		 * A kernel thread, or a process sharing the caller's user
		 * space, needs no user-space switch and earns one extra point.
		 */
		if (p->mm == this_mm || !p->mm)
			weight += 1;
		/* nice ranges from 19 down to -20; -20 is the highest priority. */
		weight += 20 - p->nice;
		goto out;
	}

	/*
	 * Realtime process, select the first one on the
	 * runqueue (taking priorities within processes
	 * into account).
	 */
	/*
	 * Real-time processes (SCHED_FIFO or SCHED_RR) have a separate,
	 * positive priority rt_priority; their weight is always at least
	 * 1000, far above any SCHED_OTHER process.
	 */
	weight = 1000 + p->rt_priority;
out:
	return weight;
}


    switch_mm,对用户空间的切换:

static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
{if (prev != next) {/* stop flush ipis for the previous mm */clear_bit(cpu, &prev->cpu_vm_mask);/** Re-load LDT if necessary*/if (prev->context.segments != next->context.segments)load_LDT(next);
#ifdef CONFIG_SMPcpu_tlbstate[cpu].state = TLBSTATE_OK;cpu_tlbstate[cpu].active_mm = next;
#endifset_bit(cpu, &next->cpu_vm_mask);/* Re-load page tables */asm volatile("movl %0,%%cr3": :"r" (__pa(next->pgd)));//我们只关心这一句,将新进程页面目录的起始物理地址装入到控制寄存器CR3中}
#ifdef CONFIG_SMPelse {cpu_tlbstate[cpu].state = TLBSTATE_OK;if(cpu_tlbstate[cpu].active_mm != next)BUG();if(!test_and_set_bit(cpu, &next->cpu_vm_mask)) {/* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must flush our tlb.*/local_flush_tlb();}}
#endif
}

    switch_to,到了最后要切换进程的关头了。所谓进程的切换主要是堆栈的切换,假设 next为fork出来的子进程,要切换到子进程,代码如下:

/*
 * switch_to(prev, next, last) - switch the kernel stack and the execution
 * point from process prev to process next.
 *
 * Step by step:
 *   - esi, edi and ebp are pushed onto prev's kernel stack;
 *   - the current esp is saved in prev->thread.esp;
 *   - esp is loaded from next->thread.esp; from here on the stack is next's;
 *   - the address of label "1:" is saved in prev->thread.eip, so prev will
 *     resume there the next time it is switched in;
 *   - next->thread.eip is pushed onto (next's) stack;
 *   - __switch_to is entered with jmp rather than call, so when it executes
 *     ret the value pushed last, next->thread.eip, is used as the return
 *     address;
 *   - for a process that was switched out here earlier, that address is the
 *     label "1:", and the three pops restore what that process pushed on its
 *     own stack.  For a freshly forked child it is whatever copy_thread()
 *     stored in thread.eip.
 *
 * Operands: %0 = prev->thread.esp, %1 = prev->thread.eip, %2 = last (ebx),
 * %3 = next->thread.esp, %4 = next->thread.eip; prev is passed in eax and
 * ebx, next in edx.
 */
#define switch_to(prev,next,last) do {					\
	asm volatile("pushl %%esi\n\t"	/* onto prev's stack */		\
		     "pushl %%edi\n\t"	/* onto prev's stack */		\
		     "pushl %%ebp\n\t"	/* onto prev's stack */		\
		     "movl %%esp,%0\n\t"	/* save ESP */		\
		     "movl %3,%%esp\n\t"	/* restore ESP */	\
		     "movl $1f,%1\n\t"		/* save EIP */		\
		     "pushl %4\n\t"		/* restore EIP */	\
		     "jmp __switch_to\n"				\
		     "1:\t"						\
		     "popl %%ebp\n\t"					\
		     "popl %%edi\n\t"					\
		     "popl %%esi\n\t"					\
		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip),	\
		      "=b" (last)					\
		     :"m" (next->thread.esp),"m" (next->thread.eip),	\
		      "a" (prev), "d" (next),				\
		      "b" (prev));					\
} while (0)
    还记得子进程copy_thread时,设置了thread.esp和thread.eip:

int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,unsigned long unused,struct task_struct * p, struct pt_regs * regs)
{struct pt_regs * childregs;childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;//指向了子进程系统空间堆栈中的pt_regs结构struct_cpy(childregs, regs);//把当前进程系统空间堆栈中的pt_regs结构复制过去childregs->eax = 0;//子进程系统空间堆栈中的pt_regs结构eax置成0childregs->esp = esp;//子进程系统空间堆栈中的pt_regs结构esp置成这里的参数esp,在fork中,则来自调用do_fork()前夕的regs.esp,所以实际上并没有改变p->thread.esp = (unsigned long) childregs;//子进程系统空间堆栈中pt_regs结构的起始地址p->thread.esp0 = (unsigned long) (childregs+1);//指向子进程的系统空间堆栈的顶端p->thread.eip = (unsigned long) ret_from_fork;savesegment(fs,p->thread.fs);savesegment(gs,p->thread.gs);unlazy_fpu(current);struct_cpy(&p->thread.i387, ¤t->thread.i387);return 0;
}
    所以,此时堆栈已经切换到子进程系统空间堆栈中pt_regs结构的起始地址,eip为ret_from_fork。

  

    __switch_to,代码如下:

void __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{struct thread_struct *prev = &prev_p->thread,*next = &next_p->thread;struct tss_struct *tss = init_tss + smp_processor_id();unlazy_fpu(prev_p);/** Reload esp0, LDT and the page table pointer:*/tss->esp0 = next->esp0;//将TSS中的内核空间(0级)堆栈指针换成next->esp0,指向子进程的系统空间堆栈的顶端/** Save away %fs and %gs. No need to save %es and %ds, as* those are always kernel segments while inside the kernel.*/asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));/** Restore %fs and %gs.*/loadsegment(fs, next->fs);loadsegment(gs, next->gs);/** Now maybe reload the debug registers*/if (next->debugreg[7]){loaddebug(next, 0);loaddebug(next, 1);loaddebug(next, 2);loaddebug(next, 3);/* no 4 and 5 */loaddebug(next, 6);loaddebug(next, 7);}if (prev->ioperm || next->ioperm) {if (next->ioperm) {/** 4 cachelines copy ... not good, but not that* bad either. Anyone got something better?* This only affects processes which use ioperm().* [Putting the TSSs into 4k-tlb mapped regions* and playing VM tricks to switch the IO bitmap* is not really acceptable.]*/memcpy(tss->io_bitmap, next->io_bitmap,IO_BITMAP_SIZE*sizeof(unsigned long));tss->bitmap = IO_BITMAP_OFFSET;} else/** a bitmap offset pointing outside of the TSS limit* causes a nicely controllable SIGSEGV if a process* tries to use a port IO instruction. The first* sys_ioperm() call sets up the bitmap properly.*/tss->bitmap = INVALID_IO_BITMAP_OFFSET;}
}
  

    jmp __switch_to后,ret返回到ret_from_fork,继续执行:

/*
 * ret_from_fork: address stored in p->thread.eip by copy_thread(), so it is
 * where a new child starts executing once it has been switched to.
 * It calls schedule_tail() with %ebx as the argument, then branches to
 * tracesys_exit when bit 0x02 (PT_TRACESYS) is set in the task's ptrace
 * field, and otherwise continues at ret_from_sys_call.
 */
ENTRY(ret_from_fork)
	pushl %ebx				/* argument for schedule_tail */
	call SYMBOL_NAME(schedule_tail)
	addl $4, %esp				/* drop the argument again */
	GET_CURRENT(%ebx)
	testb $0x02,tsk_ptrace(%ebx)	# PT_TRACESYS
	jne tracesys_exit
	jmp	ret_from_sys_call

/*
 * ret_from_sys_call: before the saved registers are restored, handle
 * pending softirqs (softirq_active & softirq_mask), a pending reschedule
 * request (need_resched) and pending signals (sigpending).
 */
ENTRY(ret_from_sys_call)
#ifdef CONFIG_SMP
	movl processor(%ebx),%eax
	shll $CONFIG_X86_L1_CACHE_SHIFT,%eax
	movl SYMBOL_NAME(irq_stat)(,%eax),%ecx		# softirq_active
	testl SYMBOL_NAME(irq_stat)+4(,%eax),%ecx	# softirq_mask
#else
	movl SYMBOL_NAME(irq_stat),%ecx		# softirq_active
	testl SYMBOL_NAME(irq_stat)+4,%ecx	# softirq_mask
#endif
	jne   handle_softirq

ret_with_reschedule:
	cmpl $0,need_resched(%ebx)
	jne reschedule
	cmpl $0,sigpending(%ebx)
	jne signal_return
restore_all:
	RESTORE_ALL
    RESTORE_ALL恢复现场并返回用户空间。由于在copy_thread()时设置了childregs->eax = 0,所以子进程返回用户空间时fork()的返回值为0,也就是执行下面这段代码中if分支里的内容:

 /* fork() returns 0 in the child, so only the child enters this branch. */
 if (!(child = fork())) {
     /* child */
     execve("/bin/echo", args, NULL);
     /* Reached only when execve() itself returns. */
     printf("I am back, something is wrong!\n");
 }

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

    已经切换到父进程,从schedule返回,goto repeat重新执行sys_wait4,这回子进程是TASK_ZOMBIE,所以调用release_task,将子进程task_struct结构和系统空间堆栈,全部释放。

static void release_task(struct task_struct * p)
{if (p != current) {
#ifdef CONFIG_SMP/** Wait to make sure the process isn't on the* runqueue (active on some other CPU still)*/for (;;) {task_lock(p);if (!p->has_cpu)break;task_unlock(p);do {barrier();} while (p->has_cpu);}task_unlock(p);
#endifatomic_dec(&p->user->processes);free_uid(p->user);unhash_process(p);release_thread(p);current->cmin_flt += p->min_flt + p->cmin_flt;current->cmaj_flt += p->maj_flt + p->cmaj_flt;current->cnswap += p->nswap + p->cnswap;/** Potentially available timeslices are retrieved* here - this way the parent does not get penalized* for creating too many processes.** (this cannot be used to artificially 'generate'* timeslices, because any timeslice recovered here* was given away by the parent in the first place.)*/current->counter += p->counter;if (current->counter >= MAX_COUNTER)current->counter = MAX_COUNTER;free_task_struct(p);//将task_struct结构和系统空间堆栈所占据的两个物理页面释放} else {printk("task releasing itself\n");}
}
/*
 * Release the two physical pages that hold a process's task_struct together
 * with its kernel stack; p is the task_struct pointer.
 */
#define free_task_struct(p) free_pages((unsigned long) (p), 1)


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部