fork之旅

前文：本文以x86_64架构的代码作为参考

fork之旅

初探

编码过程中，我们想要使用linux的系统调用，就需要进行这样一条include

#include <unistd.h>

这个文件位于${LINUX_SOURCE}/include/asm-${arch}/目录下。内容大概分为以下几个部分

1.系统调用号的定义

根据操作系统课程设计的经验，用户态进程在系统调用阶段能完成的工作仅仅是陷入内核态，然后由内核完成工作。那么内核具体需要完成的功能就由系统调用号指定，这里能看到许多非常熟悉的名字

#define __NR_read                                0
__SYSCALL(__NR_read, sys_read)
#define __NR_write                               1
__SYSCALL(__NR_write, sys_write)
#define __NR_open                                2
__SYSCALL(__NR_open, sys_open)
#define __NR_close                               3
...
#define __NR_vserver                        236
__SYSCALL(__NR_vserver, sys_ni_syscall)

#define __NR_syscall_max __NR_vserver

__SYSCALL这个宏出自$LINUX_SOURCE/arch/x86_64/kernel/syscall.c，定义比较奇怪

/*
 * Build the system call table by including <asm-x86_64/unistd.h> twice,
 * each time with a different expansion of __SYSCALL(nr, sym).
 *
 * __NO_STUBS is defined first so that the "#ifndef __NO_STUBS" part of
 * unistd.h (the _syscallN stub macros) is left out of this file.
 */
#define __NO_STUBS

/* Pass 1: every __SYSCALL(nr, sym) entry becomes an extern declaration of sym. */
#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 
/* NOTE(review): _ASM_X86_64_UNISTD_H_ is presumably the header's include
 * guard; it is undefined here so the header can be included again. */
#undef _ASM_X86_64_UNISTD_H_
#include <asm-x86_64/unistd.h>

/* Pass 2: every __SYSCALL(nr, sym) entry becomes the designated initializer "[nr] = sym,". */
#undef __SYSCALL
#define __SYSCALL(nr, sym) [ nr ] = sym, 
#undef _ASM_X86_64_UNISTD_H_

/* Type of one table slot: a pointer to a handler, declared here as void (*)(void). */
typedef void (*sys_call_ptr_t)(void); 

extern void sys_ni_syscall(void);

/*
 * One slot per system call number, 0 .. __NR_syscall_max.  The range
 * initializer first points every slot at sys_ni_syscall; the entries
 * produced by the include below then override the slots they name.
 */
sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = { 
    /* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */ 
    [0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm-x86_64/unistd.h>
};

乍一看__SYSCALL这个宏被定义了两遍，根据我的理解，两次定义其实各有分工：第一次定义把unistd.h中的每一项展开成对应处理函数的extern声明；第二次定义则把每一项展开成数组的初始化项“[ nr ] = sym,”。根据上面unistd.h中的内容，正好能对上号：

系统调用表（sys_call_table）是一个函数指针的数组
按照宏定义，所有__SYSCALL宏中的内容都会被改写为lambda，就像这个数组中预定义的第一个元素一样
在数组的静态定义中直接使用include的确暴力，不过展开之后确实满足语法

这样的定义造成了如下结果：

所有lambda都被预设至sys_ni_syscall
部分lambda被重新定向

这里sys_ni_syscall是如此定义的，在${LINUX_SOURCE}/kernel/sys.c中

/*
 * Handler for system call numbers that have no implementation; it is the
 * default filler of sys_call_table.  Always fails with -ENOSYS.
 */
asmlinkage long sys_ni_syscall(void)
{
    return -ENOSYS;
}

这个意思很明显，就是返回“无效系统调用”这个错误。

综上所述，这一部分的作用就是定义系统调用表，采用了巧妙的设计：

不同体系结构、不同编译参数可能有不同的系统调用，实现途径就是根据不同的宏，选择不同架构的unistd.h，装入系统调用表
系统调用表是定长的，但是系统调用的数量可变，预先的lambda定义使得任意内核在使用任何合法范围内（0 ~ __NR_syscall_max）的系统调用时，都不会出现不可预测的情况

2.系统调用的内核视图

什么意思呢？意思是这段宏定义的内容根本不是给人看的。。。

#ifndef __NO_STUBS

/* user-visible error numbers are in the range -1 - -4095 */

#define __syscall_clobber "r11","rcx","memory" 

#define __syscall_return(type, res) \
do { \
    if ((unsigned long)(res) >= (unsigned long)(-127)) { \
        errno = -(res); \
        res = -1; \
    } \
    return (type) (res); \
} while (0)

#ifndef __KERNEL_SYSCALLS__

#define __syscall "syscall"

#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile (__syscall \
    : "=a" (__res) \
    : "0" (__NR_##name) : __syscall_clobber ); \
__syscall_return(type,__res); \
}

#define _syscall1(type,name,type1,arg1) \
type name(type1 arg1) \
{ \
long __res; \
__asm__ volatile (__syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)) : __syscall_clobber ); \
__syscall_return(type,__res); \
}

#define _syscall2(type,name,type1,arg1,type2,arg2) \
type name(type1 arg1,type2 arg2) \
{ \
long __res; \
__asm__ volatile (__syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)) : __syscall_clobber ); \
__syscall_return(type,__res); \
}

#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
type name(type1 arg1,type2 arg2,type3 arg3) \
{ \
long __res; \
__asm__ volatile (__syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
          "d" ((long)(arg3)) : __syscall_clobber); \
__syscall_return(type,__res); \
}

#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
{ \
long __res; \
__asm__ volatile ("movq %5,%%r10 ;" __syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
      "d" ((long)(arg3)),"g" ((long)(arg4)) : __syscall_clobber,"r10" ); \
__syscall_return(type,__res); \
} 

#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
      type5,arg5) \
type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
{ \
long __res; \
__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; " __syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
      "d" ((long)(arg3)),"g" ((long)(arg4)),"g" ((long)(arg5)) : \
    __syscall_clobber,"r8","r10" ); \
__syscall_return(type,__res); \
}

/*
 * _syscall6 - define a function `name` that issues the six-argument
 * system call number __NR_<name>.
 *
 * type         return type of the generated function
 * name         function name; also selects __NR_<name>
 * typeN, argN  type and name of the N-th parameter
 *
 * Arguments 1-3 are bound to rdi/rsi/rdx through the "D"/"S"/"d"
 * constraints.  Arguments 4-6 are moved by hand into r10, r8 and r9
 * (operands %5, %6, %7), which is why those registers appear in the
 * clobber list.  The result comes back in rax and is passed through
 * __syscall_return(), which turns an error value into errno / -1.
 *
 * Two defects of the previous text are fixed here: the operand list for
 * arg5/arg6 had unbalanced parentheses and a stray comma, and the asm
 * template lacked the " ; " separator between the last movq and the
 * pasted __syscall string.
 */
#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
      type5,arg5,type6,arg6) \
type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5,type6 arg6) \
{ \
long __res; \
__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; movq %7,%%r9 ; " __syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
      "d" ((long)(arg3)),"g" ((long)(arg4)),"g" ((long)(arg5)), \
      "g" ((long)(arg6)) : \
    __syscall_clobber,"r8","r10","r9" ); \
__syscall_return(type,__res); \
}

虽然艰涩难懂，还是简要梳理一下

这块代码都是被宏__NO_STUBS包着，而这个东西就是在上面syscall.c中定义的，所以，这些东西都不会被放到syscall.c里面
这一块代码位于#ifndef __KERNEL_SYSCALLS__分支中，也就是只有在没有定义__KERNEL_SYSCALLS__这个宏时才会生效。根据我的理解，这是发起系统调用、陷入内核的接口。原因很简单：都是一些内嵌汇编，执行的正是syscall指令，一看就是要和模式切换打交道
这些宏的作用比较容易看出来：就是参数列表长度不同的系统调用的接口

3.系统调用的用户接口

这一部分比起上一部分友好了很多，一看就明白

#else /* __KERNEL_SYSCALLS__ */

/*
 * we need this inline - forking from kernel space will result
 * in NO COPY ON WRITE (!!!), until an execve is executed. This
 * is no problem, but for the stack. This is handled by not letting
 * main() use the stack at all after fork(). Thus, no function
 * calls - which means inline code for fork too, as otherwise we
 * would use the stack upon exit from 'fork()'.
 *
 * Actually only pause and fork are needed inline, so that there
 * won't be any messing with the stack from main(), but we define
 * some others too.
 */
#define __NR__exit __NR_exit

extern pid_t sys_setsid(void);
static inline pid_t setsid(void)
{
    return sys_setsid();
}

long sys_write(int fd, const char *buf, size_t size);
static inline ssize_t write(unsigned int fd, char * buf, size_t count)
{
    return sys_write(fd, buf, count);
}

extern ssize_t sys_read(unsigned int, char *, size_t);
static inline ssize_t read(unsigned int fd, char * buf, size_t count)
{
    return sys_read(fd, buf, count);
}

extern off_t sys_lseek(unsigned int, off_t, unsigned int);
static inline off_t lseek(unsigned int fd, off_t offset, unsigned int origin)
{
    return sys_lseek(fd, offset, origin);
}

extern long sys_dup(unsigned int);
static inline long dup(unsigned int fd)
{
    return sys_dup(fd);
}

/* implemented in asm in arch/x86_64/kernel/entry.S */
extern long execve(char *, char **, char **);

extern long sys_open(const char *, int, int);
static inline long open(const char * filename, int flags, int mode)
{
    return sys_open(filename, flags, mode);
}

extern long sys_close(unsigned int);
static inline long close(unsigned int fd)
{
    return sys_close(fd);
}

extern long sys_exit(int) __attribute__((noreturn));
extern inline long exit(int error_code)
{
    sys_exit(error_code);
}

struct rusage; 
long sys_wait4(pid_t pid,unsigned int * stat_addr, 
            int options, struct rusage * ru);
static inline pid_t waitpid(int pid, int * wait_stat, int flags)
{
    return sys_wait4(pid, wait_stat, flags, NULL);
}

#endif /* __KERNEL_SYSCALLS__ */

#endif /* __NO_STUBS */

/*
 * "Conditional" syscalls
 *
 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
 * but it doesn't work on all toolchains, so we just do it by hand
 */
#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall");

#endif

简要分析：

看起来，这里的所有系统调用函数的本体是分布在不同模块的代码中的，这里全部都是extern定义
另外，这里出现了我们最熟悉的系统调用的定义，而这一系列不过都是转发，真正的操作是由第二部分提到的接口进入内核，由内核完成的

总结

这一部分和fork还不太沾边，但至少我们知道了当我们调用fork的时候，究竟发生了什么？都是定义在C源文件里的函数，系统调用和一般API有什么区别？

下面我们将进一步深入，看看内核态下，fork都完成了什么工作

起点&终点

在之前的unistd.h中，我们在系统调用表中找到这样一行：

...
#define __NR_fork                               57
__SYSCALL(__NR_fork, stub_fork)
...

fork的系统调用号被接到了stub_fork这个函数上。通览全部源码，stub_fork唯一的定义在：

${LINUX_SOURCE}\arch\x86_64\kernel\entry.S

    .macro PTREGSCALL label,func
    .globl \label
\label:
    leaq    \func(%rip),%rax
    jmp ptregscall_common
    .endm

    PTREGSCALL stub_clone, sys_clone
    PTREGSCALL stub_fork, sys_fork
    PTREGSCALL stub_vfork, sys_vfork
    PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend
    PTREGSCALL stub_sigaltstack, sys_sigaltstack
    PTREGSCALL stub_iopl, sys_iopl

ENTRY(ptregscall_common)
    popq %r11
    SAVE_REST
    movq %r11, %r15
    FIXUP_TOP_OF_STACK %r11
    call *%rax
    RESTORE_TOP_OF_STACK %r11
    movq %r15, %r11
    RESTORE_REST
    pushq %r11
    ret

而entry.S是系统调用入口，一切的起点

/*
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * NOTE: This code handles signal-recognition, which happens every time
 * after an interrupt and after each system call.
 * 
 * Normal syscalls and interrupts don't save a full stack frame, this is 
 * only done for syscall tracing, signals or fork/exec et.al.
 * 
 * A note on terminology:    
 * - top of stack: Architecture defined interrupt frame from SS to RIP 
 * at the top of the kernel process stack.  
 * - partial stack frame: partially saved registers upto R11.
 * - full stack frame: Like partial stack frame, but all register saved. 
 *  
 * TODO:     
 * - schedule it carefully for the final hardware.
 */

一上来就有这样一段注释，其中说明了一些需要保存全部堆栈帧的系统调用，fork就在其列。

紧接着就是大量的汇编，其中有一个宏ENTRY，定义在$LINUX_SOURCE/include/linux/linkage.h中：

#define __ALIGN     .align 4,0x90
...
#define ENTRY(name) \
  .globl name; \
  ALIGN; \
  name:

.globl的意思应该是说将这个标签置为全局可见，所以说其他文件中的代码可以直接跳到ENTRY处

回过头来看上面包含stub_fork的部分，意思就比较清晰了：调用stub_fork的过程本质上是调用sys_fork这个地址上的函数，只不过前面后面多加了一些栈操作。

风景区

不纠结于这大量的汇编，我们在${LINUX_SOURCE}\arch\x86_64\kernel\process.c处找到了sys_fork和它的好朋友们：

/*
 * fork entry point: forwards to do_fork() with SIGCHLD as the only flag
 * bits, the caller's saved rsp as the stack start, a stack size of 0 and
 * no parent/child tid pointers.  Returns whatever do_fork() returns.
 */
asmlinkage long sys_fork(struct pt_regs regs)
{
    return do_fork(SIGCHLD, regs.rsp, &regs, 0, NULL, NULL);
}

/*
 * clone entry point: forwards the caller-supplied flags, stack pointer and
 * tid pointers to do_fork().
 *
 * CLONE_IDLETASK is always stripped from clone_flags before the call, and
 * a zero newsp is replaced by the caller's saved rsp.  Returns whatever
 * do_fork() returns.
 */
asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void *parent_tid, void *child_tid, struct pt_regs regs)
{
    unsigned long child_sp = newsp ? newsp : regs.rsp;
    unsigned long flags = clone_flags & ~CLONE_IDLETASK;

    return do_fork(flags, child_sp, &regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
/*
 * vfork entry point: same call as sys_fork() but with CLONE_VFORK and
 * CLONE_VM added to SIGCHLD.  Returns whatever do_fork() returns.
 */
asmlinkage long sys_vfork(struct pt_regs regs)
{
    return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.rsp, &regs, 0, 
            NULL, NULL);
}

这个样子基本上就清楚了，这一系列和进程产生有关的函数，全部由do_fork这个函数实现。

而这个函数就是正儿八经声明在sched.h中，实现在fork.c中的

/* fork.c */
/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * All six arguments are handed on to copy_process(); the only change is
 * that CLONE_PTRACE may be added to clone_flags when the caller has
 * ptrace state and fork_traceflag() reports an event.
 *
 * Returns the pid of the new task on success, or the negative error code
 * carried by the ERR_PTR value that copy_process() returned.
 */
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          struct pt_regs *regs,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    long pid;

    /* Work out which ptrace event (if any) has to be reported for this call. */
    if (unlikely(current->ptrace)) {
        trace = fork_traceflag (clone_flags);
        if (trace)
            clone_flags |= CLONE_PTRACE;
    }

    p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    pid = IS_ERR(p) ? PTR_ERR(p) : p->pid;

    if (!IS_ERR(p)) {
        /* Only used for CLONE_VFORK: the parent waits on it further down. */
        struct completion vfork;

        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
            /*
             * We'll start up with an immediate SIGSTOP.
             */
            sigaddset(&p->pending.signal, SIGSTOP);
            set_tsk_thread_flag(p, TIF_SIGPENDING);
        }

        /* The child is left in TASK_STOPPED unless it is woken right here. */
        p->state = TASK_STOPPED;
        if (!(clone_flags & CLONE_STOPPED))
            wake_up_forked_process(p);  /* do this last */
        ++total_forks;

        /* Report the event chosen by fork_traceflag(), with the new pid as message. */
        if (unlikely (trace)) {
            current->ptrace_message = pid;
            ptrace_notify ((trace << 8) | SIGTRAP);
        }

        if (clone_flags & CLONE_VFORK) {
            /* Block the caller until the completion set up above is signalled. */
            wait_for_completion(&vfork);
            if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
        } else
            /*
             * Let the child process run first, to avoid most of the
             * COW overhead when the child exec()s afterwards.
             */
            set_need_resched();
    }
    return pid;
}

看一眼函数注释，我们就知道找对了地方。

1.do_fork参数分析

首先是调用参数的解析，结合上面fork系列函数的定义，我们可以分析出do_fork各个参数的意义

clone_flags：定义了do_fork操作中需要复制的内容，定义在sched.h中：

/*
* cloning flags:
*/

#define CSIGNAL       0x000000ff  /* signal mask to be sent at exit */


#define CLONE_VM  0x00000100  /* set if VM shared between processes */


#define CLONE_FS  0x00000200  /* set if fs info shared between processes */


#define CLONE_FILES   0x00000400  /* set if open files shared between processes */


#define CLONE_SIGHAND 0x00000800  /* set if signal handlers and blocked signals shared */


#define CLONE_IDLETASK    0x00001000  /* set if new pid should be 0 (kernel only)*/


#define CLONE_PTRACE  0x00002000  /* set if we want to let tracing continue on the child too */


#define CLONE_VFORK   0x00004000  /* set if the parent wants the child to wake it up on mm_release */


#define CLONE_PARENT  0x00008000  /* set if we want to have the same parent as the cloner */


#define CLONE_THREAD  0x00010000  /* Same thread group? */


#define CLONE_NEWNS   0x00020000  /* New namespace group? */


#define CLONE_SYSVSEM 0x00040000  /* share system V SEM_UNDO semantics */


#define CLONE_SETTLS  0x00080000  /* create a new TLS for the child */


#define CLONE_PARENT_SETTID   0x00100000  /* set the TID in the parent */


#define CLONE_CHILD_CLEARTID  0x00200000  /* clear the TID in the child */


#define CLONE_DETACHED        0x00400000  /* Not used - CLONE_THREAD implies detached uniquely */


#define CLONE_UNTRACED        0x00800000  /* set if the tracing process can't force CLONE_PTRACE on this clone */


#define CLONE_CHILD_SETTID    0x01000000  /* set the TID in the child */


#define CLONE_STOPPED     0x02000000  /* Start in stopped state */

意思都非常明确，注释说的一清二楚，那么我们回过头，看看fork，vfork和clone分别用了什么标识：

sys_fork：SIGCHLD标识定义在signal.h中，值为17，并不在上述flag中，我们可以认为它什么特殊操作也没有采用
sys_clone：这个操作基本上就是对do_fork进行了一个简单的转接，但是用户输入的flag中不能够包括CLONE_IDLETASK这个标识
sys_vfork：在sys_fork的基础上增加了CLONE_VFORK、CLONE_VM两个标识。VFORK指定这次fork行为是VFORK，在收到子进程退出信号之前父进程会阻塞；VM标识则说明子进程和父进程共享同一内存空间

stack_start：按照字面意思就是栈的起点，看看fork和vfork，给出的值都是regs.rsp，意思就是堆栈指针寄存器，而clone给出的是用户输入的参数newsp。这就告诉我们如果想要使用clone，需要手动为子进程准备栈空间
regs：这个意思也非常明显，看看参数类型就明白了——寄存器指针
stack_size：意思是栈大小，然而很有意思的是fork系列中该参数始终是0，==让人摸不着头脑==
parent/child_tid：虽然搞不明白tid是什么，但是前面那个__user宏能说明很多问题，定义在${LINUX_SOURCE}/include/linux/compiler.h中：
```
# define __user       __attribute__((noderef, address_space(1)))


# define __kernel /* default address space */
```
address_space这个名词非常的可疑，考虑一下执行这段代码的进程，它现在处于的是内核空间，所以__kernel这个宏什么都没有，而__user这个宏却有一些让人看不懂的东西。估计目的是说明这个地址位于用户空间，而不是内核空间

2.do_fork内容分析

至此，参数我们已经分析完毕了，下面我们来仔细看看do_fork都做了些什么：

变量准备

用了这样一段代码，完成了需要贯穿整个函数体的变量的初始化工作：

struct task_struct *p;
    int trace = 0;
    long pid;

    if (unlikely(current->ptrace)) {
        trace = fork_traceflag (clone_flags);
        if (trace)
            clone_flags |= CLONE_PTRACE;
    }

其中if代码块表明子进程根据父进程的要求设定自己的追踪信息标识，具体逻辑定义在fork.c的fork_traceflag中，如下：

/*
 * Pick the ptrace event that should be reported for a fork-like call made
 * with the given clone_flags.
 *
 * Returns PTRACE_EVENT_VFORK, PTRACE_EVENT_CLONE or PTRACE_EVENT_FORK when
 * the matching PT_TRACE_* bit is set in current->ptrace, and 0 when no
 * event applies.
 */
static inline int fork_traceflag (unsigned clone_flags)
{
    /* These two flags suppress event reporting altogether. */
    if (clone_flags & (CLONE_UNTRACED | CLONE_IDLETASK))
        return 0;

    if (clone_flags & CLONE_VFORK)
        return (current->ptrace & PT_TRACE_VFORK) ? PTRACE_EVENT_VFORK : 0;

    /* An exit signal other than SIGCHLD is classified as a clone. */
    if ((clone_flags & CSIGNAL) != SIGCHLD)
        return (current->ptrace & PT_TRACE_CLONE) ? PTRACE_EVENT_CLONE : 0;

    return (current->ptrace & PT_TRACE_FORK) ? PTRACE_EVENT_FORK : 0;
}

PCB的初始化

接下来通过copy_process函数，构造新的PCB，并把指针返回给变量p：

p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr);

copy_process的实现也在fork.c中，内容相当丰富：

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * Returns the new task_struct on success, or an ERR_PTR()-encoded
 * negative error code on failure.  On failure the bad_fork_* labels at
 * the end of the function undo the setup steps in reverse order; each
 * label falls through into the next one.
 */
struct task_struct *copy_process(unsigned long clone_flags,
                 unsigned long stack_start,
                 struct pt_regs *regs,
                 unsigned long stack_size,
                 int __user *parent_tidptr,
                 int __user *child_tidptr)
{
    int retval;
    struct task_struct *p = NULL;

    /* CLONE_NEWNS and CLONE_FS may not be requested together. */
    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

    /*
     * CLONE_DETACHED must match CLONE_THREAD: it's a historical
     * thing.
     */
    if (!(clone_flags & CLONE_DETACHED) != !(clone_flags & CLONE_THREAD)) {
        /* Warn about the old no longer supported case so that we see it */
        if (clone_flags & CLONE_THREAD) {
            static int count;
            if (count < 5) {
                count++;
                printk(KERN_WARNING "%s trying to use CLONE_THREAD without CLONE_DETACH\n", current->comm);
            }
        }
        return ERR_PTR(-EINVAL);
    }

    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;

    /* Duplicate the current task's task_struct; p is the new task from here on. */
    retval = -ENOMEM;
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;

    /*
     * Enforce RLIMIT_NPROC, unless the caller has CAP_SYS_ADMIN or
     * CAP_SYS_RESOURCE, or the task belongs to root_user.
     */
    retval = -EAGAIN;
    if (atomic_read(&p->user->processes) >=
            p->rlim[RLIMIT_NPROC].rlim_cur) {
        if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
                p->user != &root_user)
            goto bad_fork_free;
    }

    atomic_inc(&p->user->__count);
    atomic_inc(&p->user->processes);

    /*
     * If multiple threads are within copy_process(), then this check
     * triggers too late. This doesn't hurt, the check is only there
     * to stop root fork bombs.
     */
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    /* Take module references for the exec domain and, if set, the binary format. */
    if (!try_module_get(p->thread_info->exec_domain->module))
        goto bad_fork_cleanup_count;

    if (p->binfmt && !try_module_get(p->binfmt->module))
        goto bad_fork_cleanup_put_domain;

#ifdef CONFIG_PREEMPT
    /*
     * schedule_tail drops this_rq()->lock so we compensate with a count
     * of 1.  Also, we want to start with kernel preemption disabled.
     */
    p->thread_info->preempt_count = 1;
#endif
    p->did_exec = 0;
    p->state = TASK_UNINTERRUPTIBLE;

    /* CLONE_IDLETASK tasks get pid 0; all others take a pid from alloc_pidmap(). */
    copy_flags(clone_flags, p);
    if (clone_flags & CLONE_IDLETASK)
        p->pid = 0;
    else {
        p->pid = alloc_pidmap();
        if (p->pid == -1)
            goto bad_fork_cleanup;
    }
    /* CLONE_PARENT_SETTID: store the new pid at the user address parent_tidptr. */
    retval = -EFAULT;
    if (clone_flags & CLONE_PARENT_SETTID)
        if (put_user(p->pid, parent_tidptr))
            goto bad_fork_cleanup;

    /* Reset the fields that were copied from the parent but must start fresh. */
    p->proc_dentry = NULL;

    INIT_LIST_HEAD(&p->run_list);

    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    INIT_LIST_HEAD(&p->posix_timers);
    init_waitqueue_head(&p->wait_chldexit);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);
    spin_lock_init(&p->switch_lock);
    spin_lock_init(&p->proc_lock);

    clear_tsk_thread_flag(p, TIF_SIGPENDING);
    init_sigpending(&p->pending);

    p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
    p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
    init_timer(&p->real_timer);
    p->real_timer.data = (unsigned long) p;

    p->leader = 0;      /* session leadership doesn't inherit */
    p->tty_old_pgrp = 0;
    p->utime = p->stime = 0;
    p->cutime = p->cstime = 0;
    p->array = NULL;
    p->lock_depth = -1;     /* -1 = no lock */
    p->start_time = get_jiffies_64();
    p->security = NULL;
    p->io_context = NULL;

    /*
     * Each step below jumps, on failure, to the label that undoes the
     * steps completed before it; retval carries the step's own result.
     */
    retval = -ENOMEM;
    if ((retval = security_task_alloc(p)))
        goto bad_fork_cleanup;
    /* copy all the process information */
    if ((retval = copy_semundo(clone_flags, p)))
        goto bad_fork_cleanup_security;
    if ((retval = copy_files(clone_flags, p)))
        goto bad_fork_cleanup_semundo;
    if ((retval = copy_fs(clone_flags, p)))
        goto bad_fork_cleanup_files;
    if ((retval = copy_sighand(clone_flags, p)))
        goto bad_fork_cleanup_fs;
    if ((retval = copy_signal(clone_flags, p)))
        goto bad_fork_cleanup_sighand;
    if ((retval = copy_mm(clone_flags, p)))
        goto bad_fork_cleanup_signal;
    if ((retval = copy_namespace(clone_flags, p)))
        goto bad_fork_cleanup_mm;
    retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
    if (retval)
        goto bad_fork_cleanup_namespace;

    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;

    /*
     * Syscall tracing should be turned off in the child regardless
     * of CLONE_PTRACE.
     */
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);

    /* Our parent execution domain becomes current domain
       These must match for thread signalling to apply */

    p->parent_exec_id = p->self_exec_id;

    /* ok, now we should be set up.. */
    p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
    p->pdeath_signal = 0;

    /*
     * Share the timeslice between parent and child, thus the
     * total amount of pending timeslices in the system doesn't change,
     * resulting in more scheduling fairness.
     */
    local_irq_disable();
        p->time_slice = (current->time_slice + 1) >> 1;
    /*
     * The remainder of the first timeslice might be recovered by
     * the parent if the child exits early enough.
     */
    p->first_time_slice = 1;
    current->time_slice >>= 1;
    p->timestamp = sched_clock();
    if (!current->time_slice) {
        /*
         * This case is rare, it happens when the parent has only
         * a single jiffy left from its timeslice. Taking the
         * runqueue lock is not a problem.
         */
        current->time_slice = 1;
        preempt_disable();
        scheduler_tick(0, 0);
        local_irq_enable();
        preempt_enable();
    } else
        local_irq_enable();
    /*
     * Ok, add it to the run-queues and make it
     * visible to the rest of the system.
     *
     * Let it rip!
     */
    p->tgid = p->pid;
    p->group_leader = p;
    INIT_LIST_HEAD(&p->ptrace_children);
    INIT_LIST_HEAD(&p->ptrace_list);

    /* Need tasklist lock for parent etc handling! */
    write_lock_irq(&tasklist_lock);
    /*
     * Check for pending SIGKILL! The new thread should not be allowed
     * to slip out of an OOM kill. (or normal SIGKILL.)
     */
    if (sigismember(&current->pending.signal, SIGKILL)) {
        write_unlock_irq(&tasklist_lock);
        retval = -EINTR;
        goto bad_fork_cleanup_namespace;
    }

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & CLONE_PARENT)
        p->real_parent = current->real_parent;
    else
        p->real_parent = current;
    p->parent = p->real_parent;

    if (clone_flags & CLONE_THREAD) {
        spin_lock(&current->sighand->siglock);
        /*
         * Important: if an exit-all has been started then
         * do not create this new thread - the whole thread
         * group is supposed to exit anyway.
         */
        if (current->signal->group_exit) {
            spin_unlock(&current->sighand->siglock);
            write_unlock_irq(&tasklist_lock);
            retval = -EAGAIN;
            goto bad_fork_cleanup_namespace;
        }
        /* A CLONE_THREAD task takes the tgid and group leader of its creator. */
        p->tgid = current->tgid;
        p->group_leader = current->group_leader;

        if (current->signal->group_stop_count > 0) {
            /*
             * There is an all-stop in progress for the group.
             * We ourselves will stop as soon as we check signals.
             * Make the new thread part of that group stop too.
             */
            current->signal->group_stop_count++;
            set_tsk_thread_flag(p, TIF_SIGPENDING);
        }

        spin_unlock(&current->sighand->siglock);
    }

    SET_LINKS(p);
    if (p->ptrace & PT_PTRACED)
        __ptrace_link(p, current->parent);

    /* Attach the pid; only a thread group leader also attaches TGID/PGID/SID. */
    attach_pid(p, PIDTYPE_PID, p->pid);
    if (thread_group_leader(p)) {
        attach_pid(p, PIDTYPE_TGID, p->tgid);
        attach_pid(p, PIDTYPE_PGID, process_group(p));
        attach_pid(p, PIDTYPE_SID, p->session);
        if (p->pid)
            __get_cpu_var(process_counts)++;
    } else
        link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid);

    nr_threads++;
    write_unlock_irq(&tasklist_lock);
    retval = 0;

    /* Common exit: a non-zero retval is returned encoded as an ERR_PTR. */
fork_out:
    if (retval)
        return ERR_PTR(retval);
    return p;

    /*
     * Error unwinding.  The labels are ordered so that jumping to one of
     * them runs every cleanup below it, finishing with free_task() and a
     * jump back to fork_out.
     */
bad_fork_cleanup_namespace:
    exit_namespace(p);
bad_fork_cleanup_mm:
    exit_mm(p);
bad_fork_cleanup_signal:
    exit_signal(p);
bad_fork_cleanup_sighand:
    exit_sighand(p);
bad_fork_cleanup_fs:
    exit_fs(p); /* blocking */
bad_fork_cleanup_files:
    exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
    exit_sem(p);
bad_fork_cleanup_security:
    security_task_free(p);
bad_fork_cleanup:
    if (p->pid > 0)
        free_pidmap(p->pid);
    if (p->binfmt)
        module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
    module_put(p->thread_info->exec_domain->module);
bad_fork_cleanup_count:
    atomic_dec(&p->user->processes);
    free_uid(p->user);
bad_fork_free:
    free_task(p);
    goto fork_out;
}

我们把这些内容切分成几块来看:

错误检查：验证参数是否有效，由于这里是内核，出现了异常参数就不是段错误那么简单的问题了，所以必须严格检查：

if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
    return ERR_PTR(-EINVAL);

/*
 * Thread groups must share signals as well, and detached threads
 * can only be started up within the thread group.
 */
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
    return ERR_PTR(-EINVAL);

/*
 * Shared signal handlers imply shared VM. By way of the above,
 * thread groups also imply shared VM. Blocking this case allows
 * for various simplifications in other code.
 */
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
    return ERR_PTR(-EINVAL);

/*
 * CLONE_DETACHED must match CLONE_THREAD: it's a historical
 * thing.
 */
if (!(clone_flags & CLONE_DETACHED) != !(clone_flags & CLONE_THREAD)) {
    /* Warn about the old no longer supported case so that we see it */
    if (clone_flags & CLONE_THREAD) {
        static int count;
        if (count < 5) {
            count++;
            printk(KERN_WARNING "%s trying to use CLONE_THREAD without CLONE_DETACH\n", current->comm);
        }
    }
    return ERR_PTR(-EINVAL);
}

其中，ERR_PTR这个函数定义在${LINUX_SOURCE}/include/linux/err.h中，内容十分简单

/*
 * Encode an error number as a pointer value, so that a function whose
 * return type is a pointer can hand an error code back to its caller.
 */
static inline void *ERR_PTR(long error)
{
    void *encoded = (void *) error;

    return encoded;
}

将错误号转化成指针类型返回

而printk是内核中使用的printf，功能基本上相近，此处不再展开

初始化新的数据结构，内容复制，检查进程数是否超限，更新用户信息

接下来的工作就像上面描述的顺序一样：

retval = security_task_create(clone_flags);
if (retval)
    goto fork_out;

retval = -ENOMEM;
p = dup_task_struct(current);
if (!p)
    goto fork_out;

retval = -EAGAIN;
if (atomic_read(&p->user->processes) >=
        p->rlim[RLIMIT_NPROC].rlim_cur) {
    if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
            p->user != &root_user)
        goto bad_fork_free;
}

atomic_inc(&p->user->__count);
atomic_inc(&p->user->processes);

同样，也是每个操作后都要检查合法性

security_task_create函数是对一个“方法”的转发，定义在${LINUX_SOURCE}/include/linux/security.h中，内容如下：
```
/*
 * Forward the task-creation check to the task_create hook of the current
 * security_ops table and return the hook's result unchanged.
 */
static inline int security_task_create (unsigned long clone_flags)
{
return security_ops->task_create (clone_flags);
}
```
而security_ops这个结构体也定义在这个文件当中，成员全部是函数指针

dup_task_struct：真正完成了新PCB的创建，并复制了父进程（当前进程）PCB的内容，如下：

/*
 * Allocate a task_struct and a thread_info for a new task and fill both
 * with copies of orig's.
 *
 * Returns the new task_struct, or NULL when either allocation fails (a
 * task_struct that was already allocated is freed again in that case).
 */
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
struct task_struct *tsk;
struct thread_info *ti;

prepare_to_copy(orig);

tsk = alloc_task_struct();
if (!tsk)
    return NULL;

ti = alloc_thread_info(tsk);
if (!ti) {
    free_task_struct(tsk);
    return NULL;
}

/* Copy both structures wholesale, then point the two copies at each other. */
*ti = *orig->thread_info;
*tsk = *orig;
tsk->thread_info = ti;
ti->task = tsk;

/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
return tsk;
}

prepare_to_copy函数定义在process.c中，对父进程的PCB做了一系列置位操作
alloc_task_struct函数在slab中申请一个新的PCB空间；同样的，alloc_thread_info也在对应的slab中申请了一块新的空间。而如果这些过程出现问题，就直接通过free_task_struct释放掉已经申请的空间

检查进程数是否溢出并检查模块可用性

/*
 * If multiple threads are within copy_process(), then this check
 * triggers too late. This doesn't hurt, the check is only there
 * to stop root fork bombs.
 */
if (nr_threads >= max_threads)
    goto bad_fork_cleanup_count;

if (!try_module_get(p->thread_info->exec_domain->module))
    goto bad_fork_cleanup_count;

if (p->binfmt && !try_module_get(p->binfmt->module))
    goto bad_fork_cleanup_put_domain;

这里的max_threads是一个全局变量，赋值在fork_init函数中，如下：

max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8;

看起来是根据内存大小决定的。

完成新PCB标志的复制与设置：


#ifdef CONFIG_PREEMPT

/*
 * schedule_tail drops this_rq()->lock so we compensate with a count
 * of 1.  Also, we want to start with kernel preemption disabled.
 */
p->thread_info->preempt_count = 1;

#endif

p->did_exec = 0;
p->state = TASK_UNINTERRUPTIBLE;

copy_flags(clone_flags, p);
if (clone_flags & CLONE_IDLETASK)
    p->pid = 0;
else {
    p->pid = alloc_pidmap();
    if (p->pid == -1)
        goto bad_fork_cleanup;
}
retval = -EFAULT;
if (clone_flags & CLONE_PARENT_SETTID)
    if (put_user(p->pid, parent_tidptr))
        goto bad_fork_cleanup;

进程同步机制、元数据的初始化：

p->proc_dentry = NULL;

INIT_LIST_HEAD(&p->run_list);

INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
INIT_LIST_HEAD(&p->posix_timers);
init_waitqueue_head(&p->wait_chldexit);
p->vfork_done = NULL;
spin_lock_init(&p->alloc_lock);
spin_lock_init(&p->switch_lock);
spin_lock_init(&p->proc_lock);

clear_tsk_thread_flag(p, TIF_SIGPENDING);
init_sigpending(&p->pending);

p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
init_timer(&p->real_timer);
p->real_timer.data = (unsigned long) p;

p->leader = 0;      /* session leadership doesn't inherit */
p->tty_old_pgrp = 0;
p->utime = p->stime = 0;
p->cutime = p->cstime = 0;
p->array = NULL;
p->lock_depth = -1;     /* -1 = no lock */
p->start_time = get_jiffies_64();
p->security = NULL;
p->io_context = NULL;

根据clone_flags为子进程复制（或共享）各个描述符：

retval = -ENOMEM;
if ((retval = security_task_alloc(p)))
    goto bad_fork_cleanup;
/* copy all the process information */
if ((retval = copy_semundo(clone_flags, p)))
    goto bad_fork_cleanup_security;
if ((retval = copy_files(clone_flags, p)))
    goto bad_fork_cleanup_semundo;
if ((retval = copy_fs(clone_flags, p)))
    goto bad_fork_cleanup_files;
if ((retval = copy_sighand(clone_flags, p)))
    goto bad_fork_cleanup_fs;
if ((retval = copy_signal(clone_flags, p)))
    goto bad_fork_cleanup_sighand;
if ((retval = copy_mm(clone_flags, p)))
    goto bad_fork_cleanup_signal;
if ((retval = copy_namespace(clone_flags, p)))
    goto bad_fork_cleanup_mm;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
if (retval)
    goto bad_fork_cleanup_namespace;

这里出现的copy函数一般都采用如下策略：

clone_flags中设置了相关的共享标识，则直接将父进程（当前进程）的相关描述符的地址返回
否则，为子进程重新建立一个描述符

父进程将自己的时间片分给子进程

/*
 * Share the timeslice between parent and child, thus the
 * total amount of pending timeslices in the system doesn't change,
 * resulting in more scheduling fairness.
 */
local_irq_disable();
       p->time_slice = (current->time_slice + 1) >> 1;
/*
 * The remainder of the first timeslice might be recovered by
 * the parent if the child exits early enough.
 */
p->first_time_slice = 1;
current->time_slice >>= 1;
p->timestamp = sched_clock();
if (!current->time_slice) {
    /*
     * This case is rare, it happens when the parent has only
     * a single jiffy left from its timeslice. Taking the
     * runqueue lock is not a problem.
     */
    current->time_slice = 1;
    preempt_disable();
    scheduler_tick(0, 0);
    local_irq_enable();
    preempt_enable();
} else
    local_irq_enable();

涉及到时间片的分割，首先上来就把中断关了。local_irq_disable和local_irq_enable都是宏，内容分别是汇编的清中断标志位（cli）和置中断标志位（sti）指令
子进程如果结束的够快（一个时间片之内），父进程可以将first_time_slice域中的时间收回来。这是从注释中获得的信息，具体实现还要看进程结束的逻辑
如果父进程没有时间片了，就再给父进程一个时间片
打开中断

将新进程加入进程树

/*
 * Ok, add it to the run-queues and make it
 * visible to the rest of the system.
 *
 * Let it rip!
 */
p->tgid = p->pid;
p->group_leader = p;
INIT_LIST_HEAD(&p->ptrace_children);
INIT_LIST_HEAD(&p->ptrace_list);

/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
/*
 * Check for pending SIGKILL! The new thread should not be allowed
 * to slip out of an OOM kill. (or normal SIGKILL.)
 */
if (sigismember(&current->pending.signal, SIGKILL)) {
    write_unlock_irq(&tasklist_lock);
    retval = -EINTR;
    goto bad_fork_cleanup_namespace;
}

/* CLONE_PARENT re-uses the old parent */
if (clone_flags & CLONE_PARENT)
    p->real_parent = current->real_parent;
else
    p->real_parent = current;
p->parent = p->real_parent;

if (clone_flags & CLONE_THREAD) {
    spin_lock(&current->sighand->siglock);
    /*
     * Important: if an exit-all has been started then
     * do not create this new thread - the whole thread
     * group is supposed to exit anyway.
     */
    if (current->signal->group_exit) {
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        retval = -EAGAIN;
        goto bad_fork_cleanup_namespace;
    }
    p->tgid = current->tgid;
    p->group_leader = current->group_leader;

    if (current->signal->group_stop_count > 0) {
        /*
         * There is an all-stop in progress for the group.
         * We ourselves will stop as soon as we check signals.
         * Make the new thread part of that group stop too.
         */
        current->signal->group_stop_count++;
        set_tsk_thread_flag(p, TIF_SIGPENDING);
    }

    spin_unlock(&current->sighand->siglock);
}

由于CLONE_PARENT的存在，新进程的父进程到底是谁还是个问题：如果该标识设置，那么父进程应该是当前进程的父进程。
同时，如果子进程设置了CLONE_THREAD标识，那么子进程将加入父进程的线程组

将新的PCB与系统中维护的PID数据结构联系起来

SET_LINKS(p);
if (p->ptrace & PT_PTRACED)
    __ptrace_link(p, current->parent);

attach_pid(p, PIDTYPE_PID, p->pid);
if (thread_group_leader(p)) {
    attach_pid(p, PIDTYPE_TGID, p->tgid);
    attach_pid(p, PIDTYPE_PGID, process_group(p));
    attach_pid(p, PIDTYPE_SID, p->session);
    if (p->pid)
        __get_cpu_var(process_counts)++;
} els
    link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid);

nr_threads++;
write_unlock_irq(&tasklist_lock);
retval = 0;

至此，copy_process完成，返回的PCB已经配置完毕，并与系统相关数据结构建立起了联系

收尾工作

等copy_process返回的时候，子进程已经基本建立完成了，下面的工作就是设置子进程状态，然后要求调度器重新调度。然而由于vfork的特殊性，在这个标识下还需要对父进程进行一系列操作

if (!IS_ERR(p)) {
        struct completion vfork;

        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
            /*
             * We'll start up with an immediate SIGSTOP.
             */
            sigaddset(&p->pending.signal, SIGSTOP);
            set_tsk_thread_flag(p, TIF_SIGPENDING);
        }

        p->state = TASK_STOPPED;
        if (!(clone_flags & CLONE_STOPPED))
            wake_up_forked_process(p);  /* do this last */
        ++total_forks;

        if (unlikely (trace)) {
            current->ptrace_message = pid;
            ptrace_notify ((trace << 8) | SIGTRAP);
        }

        if (clone_flags & CLONE_VFORK) {
            wait_for_completion(&vfork);
            if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
        } else
            /*
             * Let the child process run first, to avoid most of the
             * COW overhead when the child exec()s afterwards.
             */
            set_need_resched();
    }

set_need_resched函数会设置父进程的TIF_NEED_RESCHED标识，下一次时钟中断的时候，父进程就会放弃CPU，让新建的子进程执行

总结

一次成功的fork系统调用的流程，可以用下面这张流程图简单描述
这里写图片描述

参考资料&辅助工具

lxr
sublime text 3
CSDN博主JeanCheng的博文

fork之旅

fork之旅

初探

1.系统调用号的定义

2.系统调用的内核视图

3.系统调用的用户接口

总结

起点&终点

风景区

1.do_fork参数分析

2.do_fork内容分析

变量准备

PCB的初始化

收尾工作

总结

参考资料&辅助工具

猜你喜欢