linux内核剖析---fork

fork的功能是什么？

fork的功能是复制进程。进程是一个正在运行的程序，是资源分配的最小单位，系统管理进程是依靠对进程控制管（PCB）的管理完成的，每个进程的产生分两步：

分配pcb
准备进程实体，如分配内存空间等。

fork的底层调用是什么？

fork、clone、vfork它们的底层调用的都是do_fork。

do_fork的工作流程是什么？

1、定义PCB指针struct task_struct *p;

2、分配pid

3、调用copy_process方法，创建子进程的task_struct(重点)

/**
 * 负责处理clone,fork,vfork系统调用。
 * clone_flags-与clone的flag参数相同
 * stack_start-与clone的child_stack相同
 * regs-指向通用寄存器的值。是在从用户态切换到内核态时被保存到内核态堆栈中的。
 * stack_size-未使用,总是为0
 * parent_tidptr,child_tidptr-clone中对应参数ptid,ctid相同
 */
long do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      struct pt_regs *regs,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr)
{
	struct task_struct *p;
	int trace = 0;
	/**
	 * 申请PID，通过查找pidmap_array位图,为子进程分配新的pid参数.
	 */
	long pid = alloc_pidmap();  
 
	if (pid < 0)
		return -EAGAIN;
	/**
	 * 如果父进程正在被跟踪,就检查debugger程序是否想跟踪子进程.并且子进程不是内核进程(CLONE_UNTRACED未设置)
	 * 那么就设置CLONE_PTRACE标志.
	 */
	if (unlikely(current->ptrace)) {
		trace = fork_traceflag (clone_flags);
		if (trace)
			clone_flags |= CLONE_PTRACE;
	}
     //之前是对参数的检查
	/**
	 * copy_process复制进程描述符.如果所有必须的资源都是可用的,该函数返回刚创建的task_struct描述符的地址.
	 * 这是创建进程的关键步骤.
	 */
	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr,pid);

copy_process具体做哪些工作(do_fork第三部工作)

功能：创建进程描述符以及子进程执行所需要的所有其他数据结构

1、分配pcb，继承父进程中pcb的值，只是将特有的信息改过来。通过dup_task_struct(current)来获取进程描述符（pcb）。

/**
 * 创建进程描述符以及子进程执行所需要的所有其他数据结构
 * 它的参数与do_fork相同。外加子进程的PID。
 */
static task_t *copy_process(unsigned long clone_flags,
				 unsigned long stack_start,
				 struct pt_regs *regs,
				 unsigned long stack_size,
				 int __user *parent_tidptr,
				 int __user *child_tidptr,
				 int pid)
{
	int retval;
	struct task_struct *p = NULL;
 
	/**
	 * 检查clone_flags所传标志的一致性。
	 */
 
	/**
	 * 如果CLONE_NEWNS和CLONE_FS标志都被设置，返回错误
	 */
	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);
 
	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	/**
	 * CLONE_THREAD标志被设置，并且CLONE_SIGHAND没有设置。
	 * (同一线程组中的轻量级进程必须共享信号)
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);
 
	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	/**
	 * CLONE_SIGHAND被设置，但是CLONE_VM没有设置。
	 * (共享信号处理程序的轻量级进程也必须共享内存描述符)
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);
 
	/**
	 * 通过调用security_task_create以及稍后调用security_task_alloc执行所有附加的安全检查。
	 * LINUX2.6提供扩展安全性的钩子函数，与传统unix相比，它具有更加强壮的安全模型。
	 */
	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;
 
	retval = -ENOMEM;
	/**
	 * 调用dup_task_struct为子进程获取进程描述符。
	 */
	p = dup_task_struct(current);
}

每一个进程描述符里都有一个thread_info指针,thread_info结构体保存的是进程上下文的信息。要修改thread_info *info，子进程的task_struct成员struct thread_info *info指向自己申请的struct thread_info。而在struct thread_info结构体中有一个struct task_struct类型的指针，这个指针应指向自己的struct task_struct。

static struct task_struct *dup_task_struct(struct task_struct *orig)
{
	struct task_struct *tsk;
	struct thread_info *ti;

	/**
	 * prepare_to_copy中会调用unlazy_fpu。
	 * 它把FPU、MMX和SSE/SSE2寄存器的内容保存到父进程的thread_info结构中。
	 * 稍后，dup_task_struct将把这些值复制到子进程的thread_info中。
	 */
	prepare_to_copy(orig);

	/**
	 * alloc_task_struct宏为新进程获取进程描述符，并将描述符保存到tsk局部变量中。
	 */
	tsk = alloc_task_struct();
	if (!tsk)
		return NULL;

	/**
	 * alloc_thread_info宏获取一块空闲内存区，用来存放新进程的thread_info结构和内核栈。
	 * 这块内存区字段的大小是8KB或者4KB。
	 */
	ti = alloc_thread_info(tsk);
	if (!ti) {
		free_task_struct(tsk);
		return NULL;
	}

	/** 
	 * 将current进程描述符的内容复制到tsk所指向的task_struct结构中，然后把tsk_thread_info置为ti
	 * 将current进程的thread_info内容复制给ti指向的结构中，并将ti_task置为tsk.
	 */
	*ti = *orig->thread_info;
	*tsk = *orig;
	tsk->thread_info = ti;
	ti->task = tsk;
/* One for us, one for whoever does the "release_task()" (usually parent) */
	/**
	 * 把新进程描述符的使用计数器usage设置为2，用来表示描述符正在被使用而且其相应的进程处于活动状态。
	 * 进程状态既不是EXIT_ZOMBIE，也不是EXIT_DEAD
	 */
	atomic_set(&tsk->usage,2);
	return tsk;
}

2、复制父进程打开的文件描述符


/**
 * 复制进程文件描述符
 */
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
	struct files_struct *oldf, *newf;
	struct file **old_fds, **new_fds;
	int open_files, size, i, error = 0, expand;

	/*
	 * A background process may not have any files ...
	 */
	oldf = current->files;
	if (!oldf)
		goto out;

	if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);
		goto out;
	}

	/*
	 * Note: we may be using current for both targets (See exec.c)
	 * This works because we cache current->files (old) as oldf. Don't
	 * break this.
	 */
	tsk->files = NULL;
	error = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
	if (!newf) 
		goto out;

	atomic_set(&newf->count, 1);

	spin_lock_init(&newf->file_lock);
	newf->next_fd	    = 0;
	newf->max_fds	    = NR_OPEN_DEFAULT;
	newf->max_fdset	    = __FD_SETSIZE;
	newf->close_on_exec = &newf->close_on_exec_init;
	newf->open_fds	    = &newf->open_fds_init;
	newf->fd	    = &newf->fd_array[0];

	spin_lock(&oldf->file_lock);

	open_files = count_open_files(oldf, oldf->max_fdset);
	expand = 0;

	/*
	 * Check whether we need to allocate a larger fd array or fd set.
	 * Note: we're not a clone task, so the open count won't  change.
	 */
	if (open_files > newf->max_fdset) {
		newf->max_fdset = 0;
		expand = 1;
	}
	if (open_files > newf->max_fds) {
		newf->max_fds = 0;
		expand = 1;
	}

	/* if the old fdset gets grown now, we'll only copy up to "size" fds */
	if (expand) {
		spin_unlock(&oldf->file_lock);
		spin_lock(&newf->file_lock);
		error = expand_files(newf, open_files-1);
		spin_unlock(&newf->file_lock);
		if (error < 0)
			goto out_release;
		spin_lock(&oldf->file_lock);
	}

	old_fds = oldf->fd;
	new_fds = newf->fd;

	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f) {
			get_file(f);
		} else {
			/*
			 * The fd may be claimed in the fd bitmap but not yet
			 * instantiated in the files array if a sibling thread
			 * is partway through open().  So make sure that this
			 * fd is available to the new process.
			 */
			FD_CLR(open_files - i, newf->open_fds);
		}
		*new_fds++ = f;
	}
	spin_unlock(&oldf->file_lock);

	/* compute the remainder to be cleared */
	size = (newf->max_fds - open_files) * sizeof(struct file *);

	/* This is long word aligned thus could use a optimized version */ 
	memset(new_fds, 0, size); 

	if (newf->max_fdset > open_files) {
		int left = (newf->max_fdset-open_files)/8;
		int start = open_files / (8 * sizeof(unsigned long));

		memset(&newf->open_fds->fds_bits[start], 0, left);
		memset(&newf->close_on_exec->fds_bits[start], 0, left);
	}

	tsk->files = newf;
	error = 0;
out:
	return error;

out_release:
	free_fdset (newf->close_on_exec, newf->max_fdset);
	free_fdset (newf->open_fds, newf->max_fdset);
	free_fd_array(newf->fd, newf->max_fds);
	kmem_cache_free(files_cachep, newf);
	goto out;
}

3、复制内存空间。

(1)copy_mm复制地址空间，struct mm_struct *mm, *active_mm,mm表示：进程所拥有的内存空间的描述符，对于内核线程的mm为NULL，active__mm表示：进程运行时所需要使用的进程描述符

判断是否设置了CLONE_VM标志，如果设置，创建进程，新进程共享父进程的地址空间，将mm_user加1，然后mm=oldmm，把父进程的mm_struct指针赋给子进程的mm_struct.
r如果没有设置，当前进程分配一个新的内存描述符，mm=allocate_mm(),将它的弟子放在子进程的mm中。再把父进程（*oldmm）的内容拷进(*mm)中。


/**
 * 当创建一个新的进程时，内核调用copy_mm函数，
 * 这个函数通过建立新进程的所有页表和内存描述符来创建进程的地址空间。
 * 通常，每个进程都有自己的地址空间，但是轻量级进程共享同一地址空间，即允许它们对同一组页进行寻址。
 */
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
	struct mm_struct * mm, *oldmm;
	int retval;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->nvcsw = tsk->nivcsw = 0;

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal a active VM for that..
	 */
	oldmm = current->mm;
	/**
	 * 内核线程？？
	 */
	if (!oldmm)
		return 0;

	/**
	 * 指定了CLONE_VM标志，表示创建线程。
	 */
	if (clone_flags & CLONE_VM) {
		/**
		 * 新线程共享父进程的地址空间，所以需要将mm_users加一。
		 */
		atomic_inc(&oldmm->mm_users);
		mm = oldmm;
		/*
		 * There are cases where the PTL is held to ensure no
		 * new threads start up in user mode using an mm, which
		 * allows optimizing out ipis; the tlb_gather_mmu code
		 * is an example.
		 */
		/**
		 * 如果其他CPU持有进程页表自旋锁，就通过spin_unlock_wait保证在释放锁前，缺页处理程序不会结果。
		 * 实际上，这个锁除了保护页表，还必须禁止创建新的轻量级进程。因为它们共享mm描述符
		 */
		spin_unlock_wait(&oldmm->page_table_lock);
		/**
		 * 在good_mm中，将父进程的地址空间赋给子进程。
		 * 注意前面对mm的赋值，表示了新线程使用的mm
		 * 完了，就这么简单
		 */
		goto good_mm;
	}

	/**
	 * 没有CLONE_VM标志，就必须创建一个新的地址空间。
	 * 必须要有地址空间，即使此时并没有分配内存。
	 */
	retval = -ENOMEM;
	/**
	 * 分配一个新的内存描述符。把它的地址存放在新进程的mm中。
	 */
	mm = allocate_mm();
	if (!mm)
		goto fail_nomem;

	/* Copy the current MM stuff.. */
	/**
	 * 并从当前进程复制mm的内容。
	 */
	memcpy(mm, oldmm, sizeof(*mm));
	if (!mm_init(mm))
		goto fail_nomem;

	/**
	 * 调用依赖于体系结构的init_new_context。
	 * 对于80X86来说，该函数检查当前进程是否有定制的局部描述符表。
	 * 如果有，就复制一份局部描述符表并把它插入tsk的地址空间
	 */
	if (init_new_context(tsk,mm))
		goto fail_nocontext;

	/**
	 * dup_mmap不但复制了线程区和页表，也设置了mm的一些属性.
	 * 它也会改变父进程的私有，可写的页为只读的，以使写时复制机制生效。
	 */
	retval = dup_mmap(mm, oldmm);
	if (retval)
		goto free_pt;

	mm->hiwater_rss = mm->rss;
	mm->hiwater_vm = mm->total_vm;

good_mm:
	tsk->mm = mm;
	tsk->active_mm = mm;
	return 0;

free_pt:
	mmput(mm);
fail_nomem:
	return retval;

fail_nocontext:
	/*
	 * If init_new_context() failed, we cannot use mmput() to free the mm
	 * because it calls destroy_context()
	 */
	mm_free_pgd(mm);
	free_mm(mm);
	return retval;
}

（2）复制线性区和页表，设置mm的一些属性，改变父进程的私有，可写的页为只读的，以使写时拷贝技术生效。

/**
 * 既复制父进程的线性区，也复制它的页表。
 */
static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
{
	struct vm_area_struct * mpnt, *tmp, **pprev;
	struct rb_node **rb_link, *rb_parent;
	int retval;
	unsigned long charge;
	struct mempolicy *pol;

	down_write(&oldmm->mmap_sem);
	flush_cache_mm(current->mm);
	mm->locked_vm = 0;
	mm->mmap = NULL;
	mm->mmap_cache = NULL;
	mm->free_area_cache = oldmm->mmap_base;
	mm->map_count = 0;
	mm->rss = 0;
	mm->anon_rss = 0;
	cpus_clear(mm->cpu_vm_mask);
	mm->mm_rb = RB_ROOT;
	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	pprev = &mm->mmap;

	/**
	 * 复制父进程的每一个vm_area_struct线性区描述符，并把复制品插入到子进程的线性区链表和红黑树中。
	 */
	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY) {
			__vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
							-vma_pages(mpnt));
			continue;
		}
		charge = 0;
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
			if (security_vm_enough_memory(len))
				goto fail_nomem;
			charge = len;
		}
		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
		pol = mpol_copy(vma_policy(mpnt));
		retval = PTR_ERR(pol);
		if (IS_ERR(pol))
			goto fail_nomem_policy;
		vma_set_policy(tmp, pol);
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_mm = mm;
		tmp->vm_next = NULL;
		anon_vma_link(tmp);
		file = tmp->vm_file;
		if (file) {
			struct inode *inode = file->f_dentry->d_inode;
			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);
      
			/* insert tmp into the share list, just after mpnt */
			spin_lock(&file->f_mapping->i_mmap_lock);
			tmp->vm_truncate_count = mpnt->vm_truncate_count;
			flush_dcache_mmap_lock(file->f_mapping);
			vma_prio_tree_add(tmp, mpnt);
			flush_dcache_mmap_unlock(file->f_mapping);
			spin_unlock(&file->f_mapping->i_mmap_lock);
		}

		/*
		 * Link in the new vma and copy the page table entries:
		 * link in first so that swapoff can see swap entries,
		 * and try_to_unmap_one's find_vma find the new vma.
		 */
		spin_lock(&mm->page_table_lock);
		*pprev = tmp;
		pprev = &tmp->vm_next;

		__vma_link_rb(mm, tmp, rb_link, rb_parent);
		rb_link = &tmp->vm_rb.rb_right;
		rb_parent = &tmp->vm_rb;

		mm->map_count++;
		/**
		 * copy_page_range创建必要的页表来映射线性区所包含的一组页。并且初始化新页表的表项。
		 * 对私有、可写的页（无VM_SHARED标志，有VM_MAYWRITE标志），对父子进程都标记为只读的。
		 * 为写时复制进行处理。
		 */
		retval = copy_page_range(mm, current->mm, tmp);
		spin_unlock(&mm->page_table_lock);

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto out;
	}
	retval = 0;

out:
	flush_tlb_mm(current->mm);
	up_write(&oldmm->mmap_sem);
	return retval;
fail_nomem_policy:
	kmem_cache_free(vm_area_cachep, tmp);
fail_nomem:
	retval = -ENOMEM;
	vm_unacct_memory(charge);
	goto out;
}

4、复制进程的内核栈：copy_thread()

调用cpoy_thread，用调用do_fork时CPU寄存器的值（它们还保存在父进程的内核栈中）来初始化子进程的内核栈。不过，copy_thread把eax寄存器对应字段的值（fork子进程中的返回值）设置为0。子进程描述符的thread.esp字段初始化为子进程内核栈的基地址ret_from_fork的地址存放在thread.eip中。

 int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
	unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	struct pt_regs * childregs;
	struct task_struct *tsk;
	int err;
 
	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
	*childregs = *regs;
	childregs->eax = 0;
	childregs->esp = esp;
 
	p->thread.esp = (unsigned long) childregs;
	p->thread.esp0 = (unsigned long) (childregs+1);
 
	p->thread.eip = (unsigned long) ret_from_fork;
 
	savesegment(fs,p->thread.fs);
	savesegment(gs,p->thread.gs);
 
	tsk = current;
	if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
			IO_BITMAP_BYTES);
	}
 
	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
		struct desc_struct *desc;
		struct user_desc info;
		int idx;
 
		err = -EFAULT;
		if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
			goto out;
		err = -EINVAL;
		if (LDT_empty(&info))
			goto out;
 
		idx = info.entry_number;
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			goto out;
 
		desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
		desc->a = LDT_entry_a(&info);
		desc->b = LDT_entry_b(&info);
	}
 
	err = 0;
}