首页 > 代码库 > 《linux 内核完全剖析》sched.c sched.h 代码分析笔记

《linux 内核完全剖析》sched.c sched.h 代码分析笔记

sched.c sched.h 代码分析笔记

首先上header file

sched.h



#ifndef _SCHED_H
#define _SCHED_H

/* Tick rate: the divisor used by CT_TO_SECS/CT_TO_USECS and CURRENT_TIME. */
#define HZ 100

/* Number of slots in the task[] table. */
#define NR_TASKS    64
/* Size of one task's address range (0x04000000 = 64 MB). */
#define TASK_SIZE    0x04000000
/* Size of the library area (0x00400000 = 4 MB). */
#define LIBRARY_SIZE    0x00400000

/* Compile-time sanity checks: both sizes must be 4 MB aligned. */
#if (TASK_SIZE & 0x3fffff)
#error "TASK_SIZE must be multiple of 4M"
#endif

#if (LIBRARY_SIZE & 0x3fffff)
#error "LIBRARY_SIZE must be a multiple of 4M"
#endif

/* The library area must be smaller than half of a task's range. */
#if (LIBRARY_SIZE >= (TASK_SIZE/2))
#error "LIBRARY_SIZE too damn big!"
#endif

/* All task ranges together must equal 4 GB; compared in 64 KB units (>>16). */
#if (((TASK_SIZE>>16)*NR_TASKS) != 0x10000)
#error "TASK_SIZE*NR_TASKS must be 4GB"
#endif

/* Offset of the library area: the last LIBRARY_SIZE bytes of a task's range. */
#define LIBRARY_OFFSET (TASK_SIZE - LIBRARY_SIZE)

/* Split a tick count x into whole seconds and the leftover microseconds. */
#define CT_TO_SECS(x)    ((x) / HZ)
#define CT_TO_USECS(x)    (((x) % HZ) * 1000000/HZ)

/* First and last slots of the task[] table. */
#define FIRST_TASK task[0]
#define LAST_TASK task[NR_TASKS-1]

#include <linux/head.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <signal.h>

/* close_on_exec in task_struct is a single unsigned long bitmap, hence the cap. */
#if (NR_OPEN > 32)
#error "Currently the close-on-exec-flags and select masks are in one long, max 32 files/proc"
#endif

/* Values stored in task_struct.state. */
#define TASK_RUNNING        0
#define TASK_INTERRUPTIBLE    1
#define TASK_UNINTERRUPTIBLE    2
#define TASK_ZOMBIE        3
#define TASK_STOPPED        4

#ifndef NULL
#define NULL ((void *) 0)
#endif

/* Page-table helpers; defined outside this header. */
extern int copy_page_tables(unsigned long from, unsigned long to, long size);
extern int free_page_tables(unsigned long from, unsigned long size);

/* Kernel entry points declared here for other files; defined elsewhere. */
extern void sched_init(void);
extern void schedule(void);
extern void trap_init(void);
extern void panic(const char * str);
extern int tty_write(unsigned minor,char * buf,int count);

/* Pointer to a function returning int with an unspecified parameter list. */
typedef int (*fn_ptr)();

/*
 * Math co-processor state; embedded as the last member of struct tss_struct.
 * Seven longs followed by a 20-long register area.
 */
struct i387_struct {
    long    cwd;
    long    swd;
    long    twd;
    long    fip;
    long    fcs;
    long    foo;
    long    fos;
    long    st_space[20];    /* 8*10 bytes for each FP-reg = 80 bytes */
};

/*
 * Task state segment image kept inside every task_struct.
 * Segment-selector fields are stored in longs whose upper 16 bits are zero.
 * The co-processor state (i387) is appended after trace_bitmap.
 */
struct tss_struct {
    long    back_link;    /* 16 high bits zero */
    long    esp0;
    long    ss0;        /* 16 high bits zero */
    long    esp1;
    long    ss1;        /* 16 high bits zero */
    long    esp2;
    long    ss2;        /* 16 high bits zero */
    long    cr3;
    long    eip;
    long    eflags;
    long    eax,ecx,edx,ebx;
    long    esp;
    long    ebp;
    long    esi;
    long    edi;
    long    es;        /* 16 high bits zero */
    long    cs;        /* 16 high bits zero */
    long    ss;        /* 16 high bits zero */
    long    ds;        /* 16 high bits zero */
    long    fs;        /* 16 high bits zero */
    long    gs;        /* 16 high bits zero */
    long    ldt;        /* 16 high bits zero */
    long    trace_bitmap;    /* bits: trace 0, bitmap 16-31 */
    struct i387_struct i387;
};

/*
 * Per-task descriptor. task[] holds pointers to these, and `current`
 * points at the one that is running.
 */
struct task_struct {
/* these are hardcoded - don't touch */
    long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
    long counter;    /* schedule() runs the TASK_RUNNING task with the largest value */
    long priority;    /* added to counter/2 when schedule() recomputes counters */
    long signal;    /* pending-signal bitmap; bit (signr-1) per signal */
    struct sigaction sigaction[32];
    long blocked;    /* bitmap of masked signals */
/* various fields */
    int exit_code;
    unsigned long start_code,end_code,end_data,brk,start_stack;
    long pid,pgrp,session,leader;
    int    groups[NGROUPS];
    /*
     * pointers to parent process, youngest child, younger sibling,
     * older sibling, respectively.  (p->father can be replaced with
     * p->p_pptr->pid)
     */
    struct task_struct    *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
    unsigned short uid,euid,suid;
    unsigned short gid,egid,sgid;
    /* Both are compared against jiffies in schedule(); 0 means "not set". */
    unsigned long timeout,alarm;
    long utime,stime,cutime,cstime,start_time;
    struct rlimit rlim[RLIM_NLIMITS];
    unsigned int flags;    /* per process flags, defined below */
    unsigned short used_math;
/* file system info */
    int tty;        /* -1 if no tty, so it must be signed */
    unsigned short umask;
    struct m_inode * pwd;
    struct m_inode * root;
    struct m_inode * executable;
    struct m_inode * library;
    unsigned long close_on_exec;    /* one bit per descriptor, hence NR_OPEN <= 32 */
    struct file * filp[NR_OPEN];
/* ldt for this task 0 - zero 1 - cs 2 - ds&ss */
    struct desc_struct ldt[3];
/* tss for this task */
    struct tss_struct tss;
};

/*
 * Per process flags
 */
#define PF_ALIGNWARN	0x00000001	/* Print alignment warning msgs */
					/* Not implemented yet, only for 486*/

/*
 *  INIT_TASK is the static initializer for the first task's task_struct;
 * the values are listed in struct task_struct member order. Touch at
 * your own risk! Base=0, limit=0x9ffff (=640kB)
 */
#define INIT_TASK \
/* state etc */	{ 0,15,15, \
/* signals */	0,{{},},0, \
/* ec,brk... */	0,0,0,0,0,0, \
/* pid etc.. */	0,0,0,0, \
/* suppl grps*/ {NOGROUP,}, \
/* proc links*/ &init_task.task,0,0,0, \
/* uid etc */	0,0,0,0,0,0, \
/* timeout */	0,0,0,0,0,0,0, \
/* rlimits */   { {0x7fffffff, 0x7fffffff}, {0x7fffffff, 0x7fffffff}, \
		  {0x7fffffff, 0x7fffffff}, {0x7fffffff, 0x7fffffff}, \
		  {0x7fffffff, 0x7fffffff}, {0x7fffffff, 0x7fffffff}}, \
/* flags */	0, \
/* math */	0, \
/* fs info */	-1,0022,NULL,NULL,NULL,NULL,0, \
/* filp */	{NULL,}, \
	{ \
		{0,0}, \
/* ldt */	{0x9f,0xc0fa00}, \
		{0x9f,0xc0f200}, \
	}, \
/*tss*/	{0,PAGE_SIZE+(long)&init_task,0x10,0,0,0,0,(long)&pg_dir, \
	 0,0,0,0,0,0,0,0, \
	 0,0,0x17,0x17,0x17,0x17,0x17,0x17, \
	 _LDT(0),0x80000000, \
		{} \
	}, \
}

/* Task table: one pointer per slot, NULL when the slot is unused. */
extern struct task_struct *task[NR_TASKS];
/* Compared against the resumed task in switch_to() to decide whether to clear TS. */
extern struct task_struct *last_task_used_math;
/* The task that is currently running. */
extern struct task_struct *current;
/* Tick counter; timeout and alarm values are compared against it. */
extern unsigned long volatile jiffies;
extern unsigned long startup_time;
extern int jiffies_offset;

/* Current time in seconds: startup_time plus the adjusted tick count divided by HZ. */
#define CURRENT_TIME (startup_time+(jiffies+jiffies_offset)/HZ)

extern void add_timer(long jiffies, void (*fn)(void));
/* Wait-queue primitives; p is the address of the queue's head pointer. */
extern void sleep_on(struct task_struct ** p);
extern void interruptible_sleep_on(struct task_struct ** p);
extern void wake_up(struct task_struct ** p);
extern int in_group_p(gid_t grp);

/*
 * Entry into gdt where to find first TSS. 0-nul, 1-cs, 2-ds, 3-syscall
 * 4-TSS0, 5-LDT0, 6-TSS1 etc ...
 */
#define FIRST_TSS_ENTRY 4
#define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)
/*
 * Selector of task n's TSS / LDT descriptor: the first entry index
 * shifted left by 3, plus n<<4 because each task owns two consecutive
 * GDT entries. The argument is parenthesized so that the cast applies
 * to the whole expression passed in, not just its first operand.
 */
#define _TSS(n) ((((unsigned long) (n))<<4)+(FIRST_TSS_ENTRY<<3))
#define _LDT(n) ((((unsigned long) (n))<<4)+(FIRST_LDT_ENTRY<<3))
/* Load the task register / LDT register with task n's selector. */
#define ltr(n) __asm__("ltr %%ax"::"a" (_TSS(n)))
#define lldt(n) __asm__("lldt %%ax"::"a" (_LDT(n)))
/*
 * Store into n the number of the running task: read the task register,
 * subtract the selector of TSS0 and divide by 16 (the inverse of _TSS).
 */
#define str(n) \
__asm__("str %%ax\n\t" \
	"subl %2,%%eax\n\t" \
	"shrl $4,%%eax" \
	:"=a" (n) \
	:"a" (0),"i" (FIRST_TSS_ENTRY<<3))
/*
 *	switch_to(n) should switch tasks to task nr n, first
 * checking that n isn't the current task, in which case it does nothing.
 * This also clears the TS-flag if the task we switched to has used
 * the math co-processor latest.
 */
#define switch_to(n) {\
struct {long a,b;} __tmp; \
__asm__("cmpl %%ecx,_current\n\t" \
	"je 1f\n\t" \
	"movw %%dx,%1\n\t" \
	"xchgl %%ecx,_current\n\t" \
	"ljmp %0\n\t" \
	"cmpl %%ecx,_last_task_used_math\n\t" \
	"jne 1f\n\t" \
	"clts\n" \
	"1:" \
	::"m" (*&__tmp.a),"m" (*&__tmp.b), \
	"d" (_TSS(n)),"c" ((long) task[n])); \
}

/* Round n up to the next multiple of 0x1000. */
#define PAGE_ALIGN(n) (((n)+0xfff)&0xfffff000)

/*
 * Scatter a 32-bit base into the descriptor at byte pointer addr:
 * bits 0-15 go to addr+2, bits 16-23 to addr+4, bits 24-31 to addr+7.
 */
#define _set_base(addr,base) \
__asm__("movw %%dx,%0\n\t" \
	"rorl $16,%%edx\n\t" \
	"movb %%dl,%1\n\t" \
	"movb %%dh,%2" \
	::"m" (*((addr)+2)), \
	  "m" (*((addr)+4)), \
	  "m" (*((addr)+7)), \
	  "d" (base) \
	:"dx")

/*
 * Store a limit into the descriptor at byte pointer addr: bits 0-15 go
 * to addr+0; the byte at addr+6 keeps its upper four bits and gets the
 * next byte of the limit OR-ed in.
 */
#define _set_limit(addr,limit) \
__asm__("movw %%dx,%0\n\t" \
	"rorl $16,%%edx\n\t" \
	"movb %1,%%dh\n\t" \
	"andb $0xf0,%%dh\n\t" \
	"orb %%dh,%%dl\n\t" \
	"movb %%dl,%1" \
	::"m" (*(addr)), \
	  "m" (*((addr)+6)), \
	  "d" (limit) \
	:"dx")

/*
 * Descriptor-level wrappers: take the descriptor object itself.
 * set_limit stores (limit-1)>>12; `limit` is parenthesized so that an
 * expression argument (e.g. a ?: or shift) is evaluated before the -1.
 */
#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , base )
#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1)>>12 )

/* Reassemble the 32-bit base from bytes addr+7, addr+4 and the word at addr+2. */
#define _get_base(addr) ({\
unsigned long __base; \
__asm__("movb %3,%%dh\n\t" \
	"movb %2,%%dl\n\t" \
	"shll $16,%%edx\n\t" \
	"movw %1,%%dx" \
	:"=d" (__base) \
	:"m" (*((addr)+2)), \
	 "m" (*((addr)+4)), \
	 "m" (*((addr)+7))); \
__base;})

#define get_base(ldt) _get_base( ((char *)&(ldt)) )

/* Value loaded by the lsll instruction for the given selector, plus one. */
#define get_limit(segment) ({ \
unsigned long __limit; \
__asm__("lsll %1,%0\n\tincl %0":"=r" (__limit):"r" (segment)); \
__limit;})

#endif





            每个任务都有两个堆栈,分别用于用户态和内核态程序的执行,并且分别称为用户态堆栈和内核态堆栈。除了处于不同的CPU特权级之外,这两个堆栈之间的主要区别在于任务的内核态堆栈很小,所保存的数据量最多不能超过(4096 - 任务数据结构大小)个字节,大约为3K,而任务的用户态堆栈却可以在用户的64M空间内延伸。




show_task

/*
 * Print a summary of task p, which sits in slot nr: pid, state, parent
 * and youngest-child pids, how many bytes right after the task_struct
 * are still zero, one saved word labelled "PC", and the sibling pids
 * when the task has any.
 */
void show_task(int nr,struct task_struct * p)
{
    /* Size of the area that follows the task_struct inside its 4096 bytes. */
    int stack_bytes = 4096-sizeof(struct task_struct);
    char *after_struct = (char *)(p+1);
    int unused;

    printk("%d: pid=%d, state=%d, father=%d, child=%d, ",nr,p->pid,
        p->state, p->p_pptr->pid, p->p_cptr ? p->p_cptr->pid : -1);

    /* Count the run of zero bytes that starts right after the struct. */
    for (unused = 0; unused < stack_bytes && !after_struct[unused]; unused++)
        ;
    printk("%d/%d chars free in kstack\n\r",unused,stack_bytes);

    /*
     * Long number 1019 counted from the start of the task's area.
     * NOTE(review): byte offset 4076 is outside the task_struct, so this
     * is presumably the EIP saved near the top of the kernel stack on
     * kernel entry rather than tss.eip - confirm.
     */
    printk("   PC=%08X.", *(1019 + (unsigned long *) p));

    if (!p->p_ysptr && !p->p_osptr) {
        printk("\n\r");
        return;
    }
    /* At least one sibling exists; -1 stands for the missing one. */
    printk("   Younger sib=%d, older sib=%d\n\r",
        p->p_ysptr ? p->p_ysptr->pid : -1,
        p->p_osptr ? p->p_osptr->pid : -1);
}
关于show_task讨论的一些帖子

http://www.oldlinux.org/oldlinux/viewthread.php?tid=12182

http://www.oldlinux.org/oldlinux/viewthread.php?tid=14683


show_state

/* Dump every occupied slot of the task table through show_task(). */
void show_state(void)
{
    int slot = 0;

    printk("\rTask-info:\n\r");
    while (slot < NR_TASKS) {
        /* Empty slots hold NULL and are skipped. */
        if (task[slot])
            show_task(slot,task[slot]);
        slot++;
    }
}




              在内核中的调度程序用于选择系统中下一个要运行的进程。这种选择运行机制是多任务操作系统的基础。调度程序可以看作在处于运行状态的进程之间分配CPU运行时间的管理代码。Linux进程是抢占式的,但被抢占的进程仍处于TASK_RUNNING状态,只是暂时没有被CPU运行。进程的抢占发生在进程处于用户态执行阶段,在内核态执行时是不能被抢占的。(0.12的不可以,貌似现在的可以了)


             schedule()函数首先扫描任务数组,通过比较每个就绪状态任务的运行时间递减滴答计数counter的值来确定当前哪个进程运行的时间最少。哪个counter值最大,就表示该进程运行时间还不长,于是就选中该进程,并使用任务切换宏函数切换到该进程运行。

schedule()

/*
 * schedule - pick the next task to run and switch to it.
 *
 * Pass 1 walks the task table from the last slot down to (but not
 * including) slot 0, expiring timeouts and alarms and waking
 * interruptible sleepers that have a deliverable signal.
 * Pass 2 selects the TASK_RUNNING task with the largest counter; if
 * every runnable task has counter 0, all counters are recomputed and the
 * selection is repeated. If no task is runnable, slot 0 is chosen.
 */
void schedule(void)
{
	int i,next,c;
	struct task_struct ** p;

/* check alarm, wake up any interruptible tasks that have got a signal */

	/* p walks the slots backwards; *p is the task pointer held in the slot. */
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
		if (*p) {
			/*
			 * timeout is compared against the running jiffies count:
			 * once jiffies has passed it, the timeout has expired.
			 */
			if ((*p)->timeout && (*p)->timeout < jiffies) {
				(*p)->timeout = 0;
				/* An expired timeout wakes only an interruptible sleeper. */
				if ((*p)->state == TASK_INTERRUPTIBLE)
					(*p)->state = TASK_RUNNING;
			}
			/* Expired alarm: raise SIGALRM in the pending bitmap and disarm it. */
			if ((*p)->alarm && (*p)->alarm < jiffies) {
				(*p)->signal |= (1<<(SIGALRM-1));
				(*p)->alarm = 0;
			}
			/*
			 * A pending signal that is not masked wakes an interruptible
			 * sleeper. Only bits inside _BLOCKABLE can be masked by
			 * `blocked`. NOTE(review): _BLOCKABLE is defined outside this
			 * view; presumably everything except SIGKILL/SIGSTOP - confirm.
			 */
			if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
			(*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;
		}
 
/* this is the scheduler proper: */

	while (1) {
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		/* Visit slots NR_TASKS-1 .. 1; remember the largest counter in c and its slot in next. */
		while (--i) {
			/* Empty slot: skip it. */
			if (!*--p)
				continue;
			/* Only TASK_RUNNING tasks compete for the CPU. */
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		}
		/*
		 * c > 0: a runnable task with time left was found.
		 * c == -1: nothing is runnable, next is still 0.
		 * Either way the choice is final.
		 */
		if (c) break;
		/*
		 * c == 0: every runnable task has used up its counter. Recompute
		 * the counter of every task in slots 1.., whatever its state, as
		 * counter/2 + priority, then select again.
		 */
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	}
	/* Hand the CPU to the chosen slot; switch_to() does nothing if it is already current. */
	switch_to(next);
}



       每当选择出一个新的可以运行的进程时,switch_to()宏执行实际进程切换操作。该宏会把CPU的当前进程状态(context)切换成新进程的状态。

       在切换之前,switch_to首先检查要切换的进程是否就是当前进程。如果是,啥也别做,直接退出。如果不是,就把内核全局变量current置为新任务的指针。然后ljmp 长跳转到新任务的任务状态段TSS组成的地址处,造成CPU执行任务切换操作。此时,CPU会把其所有寄存器的状态保存到当前任务寄存器TR中TSS段选择符所指向的当前进程任务数据结构的tss结构中,然后把新任务状态段选择符所指向的新任务数据结构tss结构中的寄存器恢复到CPU中,系统正式开始运行新切换的任务。



switch_to


/*
 * Annotated copy of switch_to(n). Operands: %0 = __tmp.a, %1 = __tmp.b,
 * edx = _TSS(n) (selector of task n's TSS), ecx = task[n].
 * Every annotation is a block comment placed before the trailing
 * backslash, so the line continuations stay intact.
 */
#define switch_to(n) {struct {long a,b;} __tmp; \
__asm__("cmpl %%ecx,_current\n\t"          /* is task[n] already the current task? */ \
    "je 1f\n\t"                            /* yes: nothing to do, go to label 1 */ \
    "movw %%dx,%1\n\t"                     /* store the 16-bit TSS selector in __tmp.b */ \
    "xchgl %%ecx,_current\n\t"             /* current = task[n]; ecx = the old current */ \
    "ljmp %0\n\t"                          /* far jump through __tmp: this performs the task */ \
                                           /* switch; the old task resumes at the next line */ \
    "cmpl %%ecx,_last_task_used_math\n\t"  /* back again: compare ecx with last_task_used_math */ \
    "jne 1f\n\t"                           /* different: leave the TS flag alone */ \
    "clts\n"                               /* same: clear the TS flag */ \
    "1:"                                   /* common exit label */ \
    ::"m" (*&__tmp.a),"m" (*&__tmp.b), \
    "d" (_TSS(n)),"c" ((long) task[n])); }

为什么会执行这句话

cmpl %%ecx,_last_task_used_math

既然任务切换时CPU会恢复寄存器现场,那么它当然也会保存寄存器现场了。这些寄存器现场都会被写入原任务的tss结构里,值得注意的是,EIP会指向引起任务切换的指令ljmp的下一条指令cmpl。所以,很明显,当原任务有朝一日再次被调度运行时,它将从EIP所指的地方继续执行,而这个地方恰巧就是cmpl !
比较有用的一个帖子:

http://www.oldlinux.org/oldlinux/archiver/?tid-5390.html





sys_pause

/*
 * Mark the calling task as an interruptible sleeper and reschedule.
 * Always returns 0 once the task runs again.
 */
int sys_pause(void)
{
    current->state = TASK_INTERRUPTIBLE;
    schedule();
    return 0;
}



                                             

__sleep_on

/*
 * __sleep_on - put the current task to sleep on the wait queue *p.
 * @p:     address of the queue's head pointer; NULL is ignored.
 * @state: state stored in current->state before the first schedule().
 *
 * The queue is an implicit chain: each sleeper saves the previous head
 * in its local `tmp` and installs itself as the new head. It gives up
 * the CPU and carries on only once it is the head again (or the head
 * has been cleared), then wakes the sleeper it displaced.
 */
static inline void __sleep_on(struct task_struct **p, int state)
{
    struct task_struct *tmp;

    /* Nothing to sleep on. */
    if (!p)
        return;
    /* The initial task (init_task.task) is not allowed to sleep. */
    if (current == &(init_task.task))
        panic("task[0] trying to sleep");
    tmp = *p;        /* remember the previous head of the queue */
    *p = current;    /* the current task becomes the new head */
    current->state = state;
repeat:    schedule();
    /*
     * Running again. If another task has become the head since, mark that
     * task runnable, go back to uninterruptible sleep and wait until this
     * task is the head again.
     */
    if (*p && *p != current) {
        (**p).state = 0;
        current->state = TASK_UNINTERRUPTIBLE;
        goto repeat;
    }
    if (!*p)
        printk("Warning: *P = NULL\n\r");
    /*
     * Intentional assignment: restore the previous head, which removes the
     * current task from the queue, and if there was one mark it
     * runnable (0 == TASK_RUNNING).
     */
    if (*p = tmp)
        tmp->state=0;
}

interruptible_sleep_on

/* Sleep on the wait queue *p in state TASK_INTERRUPTIBLE. */
void interruptible_sleep_on(struct task_struct **p)
{
    __sleep_on(p,TASK_INTERRUPTIBLE);
}


sleep_on

/* Sleep on the wait queue *p in state TASK_UNINTERRUPTIBLE. */
void sleep_on(struct task_struct **p)
{
    __sleep_on(p,TASK_UNINTERRUPTIBLE);
}

wake_up

void wake_up(struct task_struct **p)//唤醒进程
{
    if (p && *p) {
        if ((**p).state == TASK_STOPPED)
            printk("wake_up: TASK_STOPPED");
        if ((**p).state == TASK_ZOMBIE)
            printk("wake_up: TASK_ZOMBIE");
        (**p).state=0; //TASK_RUNNING
    }
}


sys_getpid,sys_getppid,sys_getuid,sys_geteuid,sys_getgid,sys_getegid,sys_nice

/* Return the pid of the calling task. */
int sys_getpid(void)
{
    return current->pid;
}

/* Return the pid of the calling task's parent (via p_pptr). */
int sys_getppid(void)
{

    return current->p_pptr->pid;
}

/* Return the uid field of the calling task. */
int sys_getuid(void)
{
    return current->uid;
}

/* Return the euid field of the calling task. */
int sys_geteuid(void)
{
    return current->euid;
}

/* Return the gid field of the calling task. */
int sys_getgid(void)
{
    return current->gid;
}

/* Return the egid field of the calling task. */
int sys_getegid(void)
{
    return current->egid;
}

/*
 * Lower the calling task's priority by `increment`.
 * The change is applied only when the resulting priority stays above
 * zero; otherwise the priority is left untouched. Always returns 0.
 */
int sys_nice(long increment)
{
    long lowered = current->priority - increment;

    if (lowered > 0)
        current->priority = lowered;
    return 0;
}



sched_init

/*
 * sched_init - one-time scheduler setup (per the original note, called
 * from main.c).
 *
 * Installs the GDT descriptors of the initial task, empties every other
 * task slot together with its TSS/LDT descriptor pair, loads the task
 * and LDT registers for task 0, programs the timer and installs the
 * timer-interrupt and system-call gates.
 */
void sched_init(void)
{
    int i;
    struct desc_struct * p;

    /* Other code relies on struct sigaction being exactly 16 bytes. */
    if (sizeof(struct sigaction) != 16)
        panic("Struct sigaction MUST be 16 bytes");

    /* Descriptors of the initial task go into the TSS0 / LDT0 slots. */
    set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
    set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));

    /* Skip task 0's two entries: p starts at the slot for TSS1. */
    p = gdt+2+FIRST_TSS_ENTRY;
    /* Start at 1 so the initial task that was just set up is left alone. */
    for(i=1;i<NR_TASKS;i++) {
        task[i] = NULL;
        p->a=p->b=0;    /* clear the TSS descriptor slot of task i */
        p++;
        p->a=p->b=0;    /* clear the LDT descriptor slot of task i */
        p++;
    }
/* Clear NT, so that we won't have troubles with that later on */
    /* Bit 14 of the saved flags word is masked off (0xffffbfff) before popfl. */
    __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");
    /* Load the task register and LDT register with task 0's selectors. */
    ltr(0);
    lldt(0);
    /* Program the timer with LATCH, low byte first. */
    outb_p(0x36,0x43);        /* binary, mode 3, LSB/MSB, ch 0 */
    outb_p(LATCH & 0xff , 0x40);    /* LSB */
    outb(LATCH >> 8 , 0x40);    /* MSB */
    /* Interrupt vector 0x20 is handled by timer_interrupt. */
    set_intr_gate(0x20,&timer_interrupt);
    /*
     * Clear bit 0 of the byte at port 0x21.
     * NOTE(review): presumably this unmasks the timer IRQ in the
     * interrupt controller - confirm.
     */
    outb(inb_p(0x21)&~0x01,0x21);
    /* Vector 0x80 is the system-call entry. */
    set_system_gate(0x80,&system_call);
}