linux0.11系统调用过程和fork源码解析

首先看一下fork函数的定义。

/* Expands _syscall0 to define fork() as a system-call wrapper taking no arguments. */
static inline _syscall0(int,fork)

/* System call number of fork; _syscall0 picks it up through __NR_##name. */
#define __NR_fork	2

/*
 * _syscall0 - define a wrapper function for a system call with no arguments.
 *
 * type: return type of the generated function.
 * name: function name; the system call number is taken from __NR_<name>.
 *
 * The generated function traps into the kernel with "int $0x80".  A
 * non-negative result is returned as-is; a negative result is negated,
 * stored in errno, and -1 is returned.
 *
 * Comments inside the macro body must use the block form followed by a
 * line continuation: a "//" comment would swallow the backslash-joined
 * remainder and terminate the macro definition early.
 */
#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
/* eax is both input and output: on entry it carries the system call */ \
/* number (__NR_<name>), on exit it carries the result.              */ \
__asm__ volatile ("int $0x80" \
	: "=a" (__res) \
	: "0" (__NR_##name)); \
if (__res >= 0) \
	return (type) __res; \
errno = -__res; \
return -1; \
}

fork函数利用int 0x80指令触发中断。内核在sched.c里注册了该中断的处理程序。

/* Associate interrupt vector 0x80 with system_call (set_system_gate is defined elsewhere). */
set_system_gate(0x80,&system_call);

所以执行fork函数就会执行system_call函数。但是在这之前，还有一件事情要做，就是保存现场。CPU响应int 0x80、从用户态进入内核态时，会自动在内核栈里依次压入ss、esp、eflags、cs、eip这五个寄存器，它们和iret中断返回指令出栈的寄存器是一一对应的。其中eip指向int 0x80的下一条指令，也就是系统调用返回后继续执行的位置。
（原文此处为配图：内核栈从高地址到低地址依次是ss、esp、eflags、cs、eip。）
继续看一下system_call的代码。

// System call entry point (_system_call) together with its helper labels:
//   bad_sys_call      - taken for an out-of-range system call number
//   reschedule        - runs the scheduler, then resumes at ret_from_sys_call
//   ret_from_sys_call - optional signal delivery, register restore, iret
.align 2
bad_sys_call:
	movl $-1,%eax
	iret
.align 2
reschedule:
	pushl $ret_from_sys_call
	// Jump (not call) to the scheduler: the address pushed above serves as
	// the return address, so a ret from _schedule lands at ret_from_sys_call.
	jmp _schedule
.align 2
_system_call:
	// Validate the system call number in eax (unsigned compare); anything
	// above nr_system_calls-1 returns from the interrupt with eax = -1.
	cmpl $nr_system_calls-1,%eax
	ja bad_sys_call
	// Save the segment registers on the stack.
	push %ds
	push %es
	push %fs
	// Push the (up to) three system call arguments right to left, so that
	// ebx is the first argument seen by the handler.
	pushl %edx
	pushl %ecx		# push %ebx,%ecx,%edx as parameters
	pushl %ebx		# to the system call
	// Load selector 0x10 into ds/es and selector 0x17 into fs.
	movl $0x10,%edx		# set up ds,es to kernel space
	mov %dx,%ds
	mov %dx,%es
	movl $0x17,%edx		# fs points to local data space
	mov %dx,%fs
	// Call the handler stored at index eax of _sys_call_table; each entry
	// is 4 bytes wide.
	call _sys_call_table(,%eax,4)
	// Keep the handler's return value on the stack, because eax is reused below.
	pushl %eax
	// eax = address of the current task structure.
	movl _current,%eax
	// Check the state field of the current task against 0
	// (0 presumably means "runnable" - confirm against the task state constants).
	cmpl $0,state(%eax)		# state
	// cmp sets ZF when the operands are equal; jne jumps when ZF is 0,
	// i.e. when state is non-zero, and the task is rescheduled.
	jne reschedule
	// Also reschedule when the counter field has reached 0
	// (presumably the remaining time slice).
	cmpl $0,counter(%eax)		# counter
	je reschedule
ret_from_sys_call:
	movl _current,%eax		# task[0] cannot have signals
	// Compare the current task with the first entry of the task array.
	cmpl _task,%eax
	// If they are the same, skip signal handling and go to label 3.
	je 3f
	cmpw $0x0f,CS(%esp)		# was old code segment supervisor ?
	jne 3f
	cmpw $0x17,OLDSS(%esp)		# was stack segment = 0x17 ?
	jne 3f
	// ebx = signal field, ecx = blocked field of the current task.
	movl signal(%eax),%ebx
	movl blocked(%eax),%ecx
	// Invert the blocked mask: a 1 bit now marks a signal that is not blocked.
	notl %ecx
	// ecx = received signals that are not blocked, i.e. the signals to handle.
	andl %ebx,%ecx
	/*
		Bit Scan Forward: scans ecx from the lowest bit upwards and writes
		the position of the first set bit into ecx (bit 0 gives position 0).
		If the source is 0, ZF is set to 1; otherwise ZF is cleared.
	*/
	bsfl %ecx,%ecx
	// ZF = 1 means ecx was 0: there is no signal to handle, go to label 3.
	je 3f
	// Clear bit number ecx in ebx (its previous value goes to CF): the
	// signal about to be handled is removed from the pending set.
	btrl %ecx,%ebx
	
	movl %ebx,signal(%eax)
	// Convert the 0-based bit position into the signal number, which
	// starts at 1.
	incl %ecx
	// Push the signal number as the argument.
	pushl %ecx
	// Run the signal handling routine.
	call _do_signal
	popl %eax
3:	popl %eax
	popl %ebx
	popl %ecx
	popl %edx
	pop %fs
	pop %es
	pop %ds
	iret

函数比较长，分段分析，首先看call _sys_call_table(,%eax,4)这一句。执行这条call指令后，内核栈是
（原文此处为配图：内核栈从高地址到低地址依次是ss、esp、eflags、cs、eip、ds、es、fs、edx、ecx、ebx，以及call指令压入的返回地址。）
因为call是段内调用，所以只需要压入返回地址，cs不需要入栈。压入的返回地址指向call _sys_call_table(,%eax,4)的下一句代码。我们首先进入call _sys_call_table(,%eax,4)里面看。


fn_ptr sys_call_table[] = { sys_setup, sys_exit, sys_fork, sys_read,
sys_write, sys_open, sys_close, sys_waitpid, sys_creat, sys_link,
sys_unlink, sys_execve, sys_chdir, sys_time, sys_mknod, sys_chmod,
sys_chown, sys_break, sys_stat, sys_lseek, sys_getpid, sys_mount,
sys_umount, sys_setuid, sys_getuid, sys_stime, sys_ptrace, sys_alarm,
sys_fstat, sys_pause, sys_utime, sys_stty, sys_gtty, sys_access,
sys_nice, sys_ftime, sys_sync, sys_kill, sys_rename, sys_mkdir,
sys_rmdir, sys_dup, sys_pipe, sys_times, sys_prof, sys_brk, sys_setgid,
sys_getgid, sys_signal, sys_geteuid, sys_getegid, sys_acct, sys_phys,
sys_lock, sys_ioctl, sys_fcntl, sys_mpx, sys_setpgid, sys_ulimit,
sys_uname, sys_umask, sys_chroot, sys_ustat, sys_dup2, sys_getppid,
sys_getpgrp, sys_setsid, sys_sigaction, sys_sgetmask, sys_ssetmask,
sys_setreuid,sys_setregid };

sys_call_table其实是一个函数指针数组，根据eax即系统调用的编号找到对应的函数执行。这里我们要找的是sys_fork（编号为2）。所以我们继续看sys_fork函数的代码。

// fork system call handler: asks find_empty_process for a free task slot,
// then lets copy_process build the child.  The value left in eax is the
// system call's return value.
.align 2
_sys_fork:
	// Call find_empty_process; it returns in eax either the index of a free
	// task[] slot or a negative error code (the new pid itself is kept in
	// last_pid, not returned).
	call _find_empty_process
	// Set the flags according to eax.
	testl %eax,%eax
	// Negative result (sign flag set): nothing available, jump to label 1
	// and return with the error code still in eax.
	js 1f
	// Push the registers that copy_process receives as its parameters.
	push %gs
	pushl %esi
	pushl %edi
	pushl %ebp
	// The task slot index found above becomes the first parameter (nr).
	pushl %eax
	// Build the child task.
	call _copy_process
	// Drop the five values pushed above (20 bytes) from the stack, then return.
	addl $20,%esp
1:	ret

sys_fork首先执行find_empty_process函数，找到一个可用的进程号（保存在last_pid中）和一个空闲的task数组项。代码如下

/*
 * Pick a pid for a new process and locate a free entry in task[].
 *
 * last_pid is advanced (wrapping to 1 once it turns negative) until no
 * existing task uses it.  The return value is the index of the first
 * empty task[] entry, searched from index 1 upwards so that slot 0 is
 * never handed out, or -EAGAIN when every entry is occupied.
 */
int find_empty_process(void)
{
	int slot;
	int pid_in_use;

	/* Step 1: advance last_pid until it collides with no live task. */
	do {
		if ((++last_pid) < 0)
			last_pid = 1;
		pid_in_use = 0;
		for (slot = 0; slot < NR_TASKS; slot++) {
			if (task[slot] && task[slot]->pid == last_pid) {
				pid_in_use = 1;
				break;
			}
		}
	} while (pid_in_use);

	/* Step 2: find an unused task[] entry, skipping slot 0. */
	for (slot = 1; slot < NR_TASKS; slot++) {
		if (!task[slot])
			return slot;
	}
	return -EAGAIN;
}

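find_empty_process把找到的pid保存在全局变量last_pid里，把空闲的task数组下标作为返回值放到eax里，然后返回sys_fork。根据sys_fork的代码，我们看到继续压栈寄存器然后执行copy_process函数。这时候内核栈是
（原文此处为配图：在上一幅图的基础上，内核栈又依次压入了gs、esi、edi、ebp、eax（task数组下标）以及call _copy_process的返回地址。）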
然后我们看copy_process函数的代码。

/*
 * copy_process - build the task structure of a newly forked child.
 *
 * The parameter list mirrors the kernel stack at the time of the call,
 * starting with the most recently pushed value:
 *   nr, ebp, edi, esi, gs     - pushed by _sys_fork; nr is the value returned
 *                               by find_empty_process and is used as the
 *                               task[] index
 *   none                      - return address pushed by
 *                               "call _sys_call_table(,%eax,4)"; not used
 *   ebx, ecx, edx, fs, es, ds - pushed by _system_call
 *   eip, cs, eflags, esp, ss  - saved on entry through int 0x80
 *
 * Returns last_pid on success, or -EAGAIN when no page could be obtained
 * or copy_mem() reports failure.
 */
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
		long ebx,long ecx,long edx,
		long fs,long es,long ds,
		long eip,long cs,long eflags,long esp,long ss)
{
	struct task_struct *p;
	int i;
	struct file *f;
	// Obtain a free page to hold the child's task structure.
	p = (struct task_struct *) get_free_page();
	if (!p)
		return -EAGAIN;
	// Register the new structure in the global task array.
	task[nr] = p;
	// Start from a copy of the current (parent) task structure.
	*p = *current;	/* NOTE! this doesn't copy the supervisor stack */
	// Keep the child out of the run state until setup is complete
	// (it is switched to TASK_RUNNING at the very end).
	p->state = TASK_UNINTERRUPTIBLE;
	p->pid = last_pid;
	p->father = current->pid;
	p->counter = p->priority;
	p->signal = 0;
	p->alarm = 0;
	p->leader = 0;		/* process leadership doesn't inherit */
	p->utime = p->stime = 0;
	p->cutime = p->cstime = 0;
	// Record the current tick count as the start time.
	p->start_time = jiffies;
	p->tss.back_link = 0;
	// esp0 is set to the end of the page that holds the task structure.
	p->tss.esp0 = PAGE_SIZE + (long) p;
	p->tss.ss0 = 0x10;
	// eip saved when fork trapped into the kernel: the child starts running
	// from there, i.e. at the "if (__res >= 0)" check after int $0x80.
	p->tss.eip = eip;
	p->tss.eflags = eflags;
	// The child sees 0 as the return value of fork: eax ends up in __res.
	p->tss.eax = 0;
	p->tss.ecx = ecx;
	p->tss.edx = edx;
	p->tss.ebx = ebx;
	p->tss.esp = esp;
	p->tss.ebp = ebp;
	p->tss.esi = esi;
	p->tss.edi = edi;
	// Segment selectors are 16 bits wide; keep only the low 16 bits.
	p->tss.es = es & 0xffff;
	p->tss.cs = cs & 0xffff;
	p->tss.ss = ss & 0xffff;
	p->tss.ds = ds & 0xffff;
	p->tss.fs = fs & 0xffff;
	p->tss.gs = gs & 0xffff;
	/*
		_LDT(nr) is defined elsewhere; presumably it yields the GDT
		selector of task nr's LDT descriptor - confirm against its
		definition.  On a task switch this value is loaded into the
		LDT register and the CPU then loads the matching descriptor
		from the GDT.
	*/
	p->tss.ldt = _LDT(nr); 
	p->tss.trace_bitmap = 0x80000000;
	// If the current task is the one recorded in last_task_used_math,
	// save the coprocessor state into the child's tss.
	if (last_task_used_math == current)
		__asm__("clts ; fnsave %0"::"m" (p->tss.i387));
	/*
		copy_mem is defined elsewhere.  NOTE(review): according to the
		author of this walkthrough it sets up the child's linear
		address range (base and limit stored in the LDT) and copies
		the page directory entries and page tables - confirm against
		copy_mem itself.

		Background from the same author: when the task runs, its TSS
		selector is loaded into the task register and the context in
		the TSS (e.g. cr3, the LDT selector) is loaded into the
		corresponding registers.  The LDT selector locates the task's
		LDT through the GDT; the selector in a segment register such
		as cs then picks the base, limit and access rights from that
		LDT.  Base plus the offset in ip gives the linear address,
		which the page directory and page tables translate to a
		physical address; if no physical page is mapped yet, a page
		fault is raised and handled.

		On failure (non-zero result) the task slot is released and the
		page is freed again.
	*/
	if (copy_mem(nr,p)) {
		task[nr] = NULL;
		free_page((long) p);
		return -EAGAIN;
	}
	// Parent and child share the same open files: bump the reference
	// count of every file structure in use.
	for (i=0; i<NR_OPEN;i++)
		if (f=p->filp[i])
			f->f_count++;
	// Bump the reference counts of the pwd, root and executable inodes.
	if (current->pwd)
		current->pwd->i_count++;
	if (current->root)
		current->root->i_count++;
	if (current->executable)
		current->executable->i_count++;
	/*
		Install the TSS and LDT descriptors of the new task in the GDT.
		nr << 1 is nr * 2: the offset of task nr's pair of entries from
		the first TSS / first LDT entry, counted in descriptors
		(8 bytes each, per the original note).
	*/
	set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));
	set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
	p->state = TASK_RUNNING;	/* do this last, just in case */
	return last_pid;
}

执行该函数新建了一个pcb结构，返回sys_fork后执行addl $20,%esp，丢弃上面压栈的五个值；接着ret指令把返回地址也出栈，回到call _sys_call_table(,%eax,4)的下一句代码。这时候的内核栈是
（原文此处为配图：内核栈从高地址到低地址依次是ss、esp、eflags、cs、eip、ds、es、fs、edx、ecx、ebx。）
我们继续看call _sys_call_table(,%eax,4)下面的代码。主要是判断是否需要重新调度进程。接下来进行信号的处理。信号另外分析，这里假设没有信号，则直接跳转到标签3。

    // Epilogue at label 3: eax receives the system call's return value that
    // was pushed right after the handler returned; the remaining pops undo
    // the pushes made on entry to _system_call, and iret leaves the kernel.
    popl %eax
	popl %ebx
	popl %ecx
	popl %edx
	pop %fs
	pop %es
	pop %ds
	iret

把剩下的一些寄存器出栈，pop eax即把系统调用的返回值存在eax里。eax的值是在下面的代码处得到的。

// Call the handler stored at index eax of the table (4 bytes per entry).
call _sys_call_table(,%eax,4)
// Save the handler's return value (eax) on the stack; it is popped back
// into eax just before iret.
pushl %eax

最后，通过iret中断返回指令弹出eip、cs、eflags、esp、ss五个寄存器，回到int 0x80的下一条指令处继续执行。父进程的返回值是eax，即子进程id。子进程的tss.eax被设置为0，所以子进程的返回值是0。

linux0.11系统调用过程和fork源码解析

猜你喜欢