fork之旅

fork之旅

前言:本文以x86_64架构的代码作为参考

初探

编码过程中,我们想要使用linux的系统调用,就需要进行这样一条include

#include <unistd.h>

这个文件位于${LINUX_SOURCE}/include/asm-${arch}/目录下。内容大概分为以下几个部分

1.系统调用号的定义

根据操作系统课程设计的经验,用户态进程在系统调用阶段能完成的工作仅仅是陷入内核态,然后由内核完成工作。那么内核具体需要完成的功能就由系统调用号指定,这里能看到许多非常熟悉的名字

#define __NR_read                                0
__SYSCALL(__NR_read, sys_read)
#define __NR_write                               1
__SYSCALL(__NR_write, sys_write)
#define __NR_open                                2
__SYSCALL(__NR_open, sys_open)
#define __NR_close                               3
...
#define __NR_vserver                        236
__SYSCALL(__NR_vserver, sys_ni_syscall)

#define __NR_syscall_max __NR_vserver

__SYSCALL这个宏出自$LINUX_SOURCE/arch/x86_64/kernel/syscall.c,定义比较奇怪

#define __NO_STUBS

#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 
#undef _ASM_X86_64_UNISTD_H_
#include <asm-x86_64/unistd.h>

#undef __SYSCALL
#define __SYSCALL(nr, sym) [ nr ] = sym, 
#undef _ASM_X86_64_UNISTD_H_

typedef void (*sys_call_ptr_t)(void); 

extern void sys_ni_syscall(void);

sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = { 
    /* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */ 
    [0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm-x86_64/unistd.h>
};

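怎么看都能看出来__SYSCALL这个宏被定义了两遍,不过两次定义的作用并不相同:第一次定义配合第一次include unistd.h,把每个系统调用展开成一条extern函数声明;第二次定义配合数组初始化列表中的include,把每个系统调用展开成[ nr ] = sym,形式的指定初始化项(designated initializer)。根据上面unistd.h中的内容,正好能对上号: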

  • 系统调用表(sys_call_table)是一个函数指针的数组
  • 按照第二次的宏定义,所有__SYSCALL宏都会被展开为[ nr ] = sym,形式的指定初始化项,就像这个数组中预先写好的第一个元素[0 ... __NR_syscall_max] = &sys_ni_syscall一样
  • 在数组的静态初始化列表中直接include的做法的确暴力,不过预处理展开之后确实满足语法

这样的定义造成了如下结果:

  • 所有表项都被预设为sys_ni_syscall
  • unistd.h中列出的表项随后被重新指向各自的处理函数

这里sys_ni_syscall是如此定义的,在${LINUX_SOURCE}/kernel/sys.c中

asmlinkage long sys_ni_syscall(void)
{
    return -ENOSYS;
}

这个意思很明显,就是返回“无效系统调用”(ENOSYS)这个错误。

综上所述,这一部分的作用就是定义系统调用表,采用了巧妙的设计:

  • 不同的体系结构、编译参数可能有不同的系统调用,实现途径就是根据不同的宏,选择不同架构的unistd.h,装入系统调用表
  • 系统调用表是定长的,但是系统调用的数量可变,预先把所有表项指向sys_ni_syscall,使得任意内核在使用任何合法范围内(0~__NR_syscall_max,此处即0~236)的系统调用号时,都不会出现不可预测的情况

2.系统调用的内核视图

什么意思呢?意思是这段宏定义的内容根本不是给人看的。。。

#ifndef __NO_STUBS

/* user-visible error numbers are in the range -1 - -4095 */

#define __syscall_clobber "r11","rcx","memory" 

#define __syscall_return(type, res) \
do { \
    if ((unsigned long)(res) >= (unsigned long)(-127)) { \
        errno = -(res); \
        res = -1; \
    } \
    return (type) (res); \
} while (0)

#ifndef __KERNEL_SYSCALLS__

#define __syscall "syscall"

#define _syscall0(type,name) \
type name(void) \
{ \
long __res; \
__asm__ volatile (__syscall \
    : "=a" (__res) \
    : "0" (__NR_##name) : __syscall_clobber ); \
__syscall_return(type,__res); \
}

#define _syscall1(type,name,type1,arg1) \
type name(type1 arg1) \
{ \
long __res; \
__asm__ volatile (__syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)) : __syscall_clobber ); \
__syscall_return(type,__res); \
}

#define _syscall2(type,name,type1,arg1,type2,arg2) \
type name(type1 arg1,type2 arg2) \
{ \
long __res; \
__asm__ volatile (__syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)) : __syscall_clobber ); \
__syscall_return(type,__res); \
}

#define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \
type name(type1 arg1,type2 arg2,type3 arg3) \
{ \
long __res; \
__asm__ volatile (__syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
          "d" ((long)(arg3)) : __syscall_clobber); \
__syscall_return(type,__res); \
}

#define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4) \
{ \
long __res; \
__asm__ volatile ("movq %5,%%r10 ;" __syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
      "d" ((long)(arg3)),"g" ((long)(arg4)) : __syscall_clobber,"r10" ); \
__syscall_return(type,__res); \
} 

#define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
      type5,arg5) \
type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5) \
{ \
long __res; \
__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; " __syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
      "d" ((long)(arg3)),"g" ((long)(arg4)),"g" ((long)(arg5)) : \
    __syscall_clobber,"r8","r10" ); \
__syscall_return(type,__res); \
}

#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
      type5,arg5,type6,arg6) \
type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5,type6 arg6) \
{ \
long __res; \
__asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; movq %7,%%r9" __syscall \
    : "=a" (__res) \
    : "0" (__NR_##name),"D" ((long)(arg1)),"S" ((long)(arg2)), \
      "d" ((long)(arg3)),"g" ((long)(arg4)),"g" ((long)(arg5), \
      "g" ((long)(arg6),) : \
    __syscall_clobber,"r8","r10","r9" ); \
__syscall_return(type,__res); \
}

虽然艰涩难懂,还是简要梳理一下

  • 这块代码都是被宏__NO_STUBS包着,而这个东西就是在上面syscall.c中定义的,所以,这些东西都不会被放到syscall.c里面
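  • 这一块代码位于#ifndef __KERNEL_SYSCALLS__分支内,即没有定义__KERNEL_SYSCALLS__时才会生效。从内容上看,_syscallN宏生成的是发起系统调用的存根函数:系统调用号放入rax,参数依次放入rdi、rsi、rdx、r10、r8、r9,然后执行syscall指令陷入内核,返回后由__syscall_return把错误返回值转换成errno和-1。这些嵌入式汇编正是用来完成模式切换的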
  • 这些宏的作用比较容易看出来:就是参数列表长度不同的系统调用的接口

3.系统调用的用户接口

这一部分比起上一部分友好了很多,一看就明白

#else /* __KERNEL_SYSCALLS__ */

/*
 * we need this inline - forking from kernel space will result
 * in NO COPY ON WRITE (!!!), until an execve is executed. This
 * is no problem, but for the stack. This is handled by not letting
 * main() use the stack at all after fork(). Thus, no function
 * calls - which means inline code for fork too, as otherwise we
 * would use the stack upon exit from 'fork()'.
 *
 * Actually only pause and fork are needed inline, so that there
 * won't be any messing with the stack from main(), but we define
 * some others too.
 */
#define __NR__exit __NR_exit

extern pid_t sys_setsid(void);
static inline pid_t setsid(void)
{
    return sys_setsid();
}

long sys_write(int fd, const char *buf, size_t size);
static inline ssize_t write(unsigned int fd, char * buf, size_t count)
{
    return sys_write(fd, buf, count);
}

extern ssize_t sys_read(unsigned int, char *, size_t);
static inline ssize_t read(unsigned int fd, char * buf, size_t count)
{
    return sys_read(fd, buf, count);
}

extern off_t sys_lseek(unsigned int, off_t, unsigned int);
static inline off_t lseek(unsigned int fd, off_t offset, unsigned int origin)
{
    return sys_lseek(fd, offset, origin);
}

extern long sys_dup(unsigned int);
static inline long dup(unsigned int fd)
{
    return sys_dup(fd);
}

/* implemented in asm in arch/x86_64/kernel/entry.S */
extern long execve(char *, char **, char **);

extern long sys_open(const char *, int, int);
static inline long open(const char * filename, int flags, int mode)
{
    return sys_open(filename, flags, mode);
}

extern long sys_close(unsigned int);
static inline long close(unsigned int fd)
{
    return sys_close(fd);
}

extern long sys_exit(int) __attribute__((noreturn));
extern inline long exit(int error_code)
{
    sys_exit(error_code);
}

struct rusage; 
long sys_wait4(pid_t pid,unsigned int * stat_addr, 
            int options, struct rusage * ru);
static inline pid_t waitpid(int pid, int * wait_stat, int flags)
{
    return sys_wait4(pid, wait_stat, flags, NULL);
}

#endif /* __KERNEL_SYSCALLS__ */

#endif /* __NO_STUBS */

/*
 * "Conditional" syscalls
 *
 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
 * but it doesn't work on all toolchains, so we just do it by hand
 */
#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall");

#endif

简要分析:

  • 看起来,这里的所有系统调用函数的本体是分布在不同模块的代码中的,这里全部都是extern定义
  • 另外,这里出现了我们最熟悉的系统调用的名字(write、read、open等),而这一系列不过都是转发:这一段位于#else /* __KERNEL_SYSCALLS__ */分支,每个内联函数都是直接调用对应的sys_xxx函数,而不是再经过第二部分那些执行syscall指令的宏

总结

​ 这一部分和fork还不太沾边,但至少我们知道了当我们调用fork的时候,究竟发生了什么?都是定义在C源文件里的函数,系统调用和一般API有什么区别?

​ 下面我们将进一步深入,看看内核态下,fork都完成了什么工作

起点&终点

在之前的unistd.h中,我们在系统调用表中找到这样一行:

...
#define __NR_fork                               57
__SYSCALL(__NR_fork, stub_fork)
...

fork的系统调用号被接到了stub_fork函数上。通览整个源码树,stub_fork唯一的定义在:

${LINUX_SOURCE}\arch\x86_64\kernel\entry.S

    .macro PTREGSCALL label,func
    .globl \label
\label:
    leaq    \func(%rip),%rax
    jmp ptregscall_common
    .endm

    PTREGSCALL stub_clone, sys_clone
    PTREGSCALL stub_fork, sys_fork
    PTREGSCALL stub_vfork, sys_vfork
    PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend
    PTREGSCALL stub_sigaltstack, sys_sigaltstack
    PTREGSCALL stub_iopl, sys_iopl

ENTRY(ptregscall_common)
    popq %r11
    SAVE_REST
    movq %r11, %r15
    FIXUP_TOP_OF_STACK %r11
    call *%rax
    RESTORE_TOP_OF_STACK %r11
    movq %r15, %r11
    RESTORE_REST
    pushq %r11
    ret

而entry.S是系统调用入口,一切的起点

/*
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * NOTE: This code handles signal-recognition, which happens every time
 * after an interrupt and after each system call.
 * 
 * Normal syscalls and interrupts don't save a full stack frame, this is 
 * only done for syscall tracing, signals or fork/exec et.al.
 * 
 * A note on terminology:    
 * - top of stack: Architecture defined interrupt frame from SS to RIP 
 * at the top of the kernel process stack.  
 * - partial stack frame: partially saved registers upto R11.
 * - full stack frame: Like partial stack frame, but all register saved. 
 *  
 * TODO:     
 * - schedule it carefully for the final hardware.
 */

一上来就有这样一段注释,其中说明了一些需要保存全部堆栈帧的系统调用,fork就在其列。

紧接着就是大量的汇编,其中有一个宏ENTRY,定义在$LINUX_SOURCE/include/linux/linkage.h中:

#define __ALIGN     .align 4,0x90
...
#define ENTRY(name) \
  .globl name; \
  ALIGN; \
  name:

.globl的意思应该是说将这个标签置为全局可见,所以说其他文件中的代码可以直接跳到ENTRY处

回过头来看上面包含stub_fork的部分,意思就比较清晰了:调用stub_fork的过程本质上是调用sys_fork这个地址的函数,只不过前面后面多加了一些栈操作。

风景区

不纠结于这大量的汇编,我们在${LINUX_SOURCE}\arch\x86_64\kernel\process.c处找到了sys_fork和他的好朋友们:

asmlinkage long sys_fork(struct pt_regs regs)
{
    return do_fork(SIGCHLD, regs.rsp, &regs, 0, NULL, NULL);
}

asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void *parent_tid, void *child_tid, struct pt_regs regs)
{
    if (!newsp)
        newsp = regs.rsp;
    return do_fork(clone_flags & ~CLONE_IDLETASK, newsp, &regs, 0, 
            parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs regs)
{
    return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.rsp, &regs, 0, 
            NULL, NULL);
}

这个样子基本上就清楚了,这一系列和进程产生有关的函数,全部由do_fork这个函数实现。

而这个函数就是正儿八经声明在sched.h中,实现在fork.c中的

/* fork.c */
/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          struct pt_regs *regs,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    long pid;

    if (unlikely(current->ptrace)) {
        trace = fork_traceflag (clone_flags);
        if (trace)
            clone_flags |= CLONE_PTRACE;
    }

    p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    pid = IS_ERR(p) ? PTR_ERR(p) : p->pid;

    if (!IS_ERR(p)) {
        struct completion vfork;

        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
            /*
             * We'll start up with an immediate SIGSTOP.
             */
            sigaddset(&p->pending.signal, SIGSTOP);
            set_tsk_thread_flag(p, TIF_SIGPENDING);
        }

        p->state = TASK_STOPPED;
        if (!(clone_flags & CLONE_STOPPED))
            wake_up_forked_process(p);  /* do this last */
        ++total_forks;

        if (unlikely (trace)) {
            current->ptrace_message = pid;
            ptrace_notify ((trace << 8) | SIGTRAP);
        }

        if (clone_flags & CLONE_VFORK) {
            wait_for_completion(&vfork);
            if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
        } else
            /*
             * Let the child process run first, to avoid most of the
             * COW overhead when the child exec()s afterwards.
             */
            set_need_resched();
    }
    return pid;
}

看一眼函数注释,我们就知道找对了地方。

1.do_fork参数分析

首先是调用参数的解析,结合上面fork系列函数的定义,我们可以分析出do_fork各个参数的意义

  • clone_flags:定义了do_fork操作中需要复制的内容,定义在sched.h中:

    /*
    * cloning flags:
    */
    
    #define CSIGNAL       0x000000ff  /* signal mask to be sent at exit */
    
    
    #define CLONE_VM  0x00000100  /* set if VM shared between processes */
    
    
    #define CLONE_FS  0x00000200  /* set if fs info shared between processes */
    
    
    #define CLONE_FILES   0x00000400  /* set if open files shared between processes */
    
    
    #define CLONE_SIGHAND 0x00000800  /* set if signal handlers and blocked signals shared */
    
    
    #define CLONE_IDLETASK    0x00001000  /* set if new pid should be 0 (kernel only)*/
    
    
    #define CLONE_PTRACE  0x00002000  /* set if we want to let tracing continue on the child too */
    
    
    #define CLONE_VFORK   0x00004000  /* set if the parent wants the child to wake it up on mm_release */
    
    
    #define CLONE_PARENT  0x00008000  /* set if we want to have the same parent as the cloner */
    
    
    #define CLONE_THREAD  0x00010000  /* Same thread group? */
    
    
    #define CLONE_NEWNS   0x00020000  /* New namespace group? */
    
    
    #define CLONE_SYSVSEM 0x00040000  /* share system V SEM_UNDO semantics */
    
    
    #define CLONE_SETTLS  0x00080000  /* create a new TLS for the child */
    
    
    #define CLONE_PARENT_SETTID   0x00100000  /* set the TID in the parent */
    
    
    #define CLONE_CHILD_CLEARTID  0x00200000  /* clear the TID in the child */
    
    
    #define CLONE_DETACHED        0x00400000  /* Not used - CLONE_THREAD implies detached uniquely */
    
    
    #define CLONE_UNTRACED        0x00800000  /* set if the tracing process can't force CLONE_PTRACE on this clone */
    
    
    #define CLONE_CHILD_SETTID    0x01000000  /* set the TID in the child */
    
    
    #define CLONE_STOPPED     0x02000000  /* Start in stopped state */
    

    意思都非常明确,注释说的一清二楚,那么我们回过头,看看fork,vfork和clone分别用了什么标识:

    • sys_fork:SIGCHLD标识定义在signal.h中,值为17,落在CSIGNAL(0x000000ff)掩码的范围内,表示子进程退出时发送给父进程的信号(copy_process中会把它存入p->exit_signal);除此之外没有设置任何CLONE_*标识,我们可以认为它什么特殊操作也没有采用
    • sys_clone:这个操作基本上就是对do_fork进行了一个简单的转接,但是用户输入的flag中不能够包括CLONE_IDLETASK这个标识
    • sys_vfork:在sys_fork的基础上增加了CLONE_VFORK,CLONE_VM两个标识。CLONE_VFORK指定这次fork行为是vfork:父进程会在do_fork中阻塞于wait_for_completion,直到子进程在mm_release时将其唤醒;CLONE_VM标识则说明子进程和父进程共享同一内存空间
  • stack_start:按照字面意思就是栈的起点,看看fork和vfork,给出的值都是regs.rsp,意思就是堆栈指针寄存器,而clone给出的是用户输入的参数newsp(为0时同样沿用regs.rsp)。这就告诉我们如果想要使用clone,调用者可以自行为子进程准备并指定栈

  • regs:这个意思也非常明显,看看参数类型就明白了——寄存器指针

  • stack_size:意思是栈大小,然而很有意思的是fork系列中该参数始终是0,==让人摸不着头脑==

  • parent/child_tid:虽然搞不明白tid是什么,但是前面那个__user宏能说明很多问题,定义在${LINUX_SOURCE}/include/linux/compiler.h中:

    
    # define __user       __attribute__((noderef, address_space(1)))
    
    
    # define __kernel /* default address space */
    

    address_space这个名词非常的可疑,考虑一下执行这段代码的进程,他现在处于的是内核空间,所以__kernel这个宏什么都没有,而__user这个宏却有一些让人看不懂的东西。估计目的是说明这地址位于用户空间,而不是内核空间

2.do_fork内容分析

至此,参数我们已经分析完毕了,下面我们来仔细看看do_fork都做了些什么:

变量准备

用了这样一段代码,完成了需要贯穿整个函数体的变量的初始化工作:

struct task_struct *p;
    int trace = 0;
    long pid;

    if (unlikely(current->ptrace)) {
        trace = fork_traceflag (clone_flags);
        if (trace)
            clone_flags |= CLONE_PTRACE;
    }

其中if代码块表明子进程根据父进程的要求设定自己的追踪信息标识,具体逻辑定义在fork.c的fork_traceflag中,如下:

static inline int fork_traceflag (unsigned clone_flags)
{
    if (clone_flags & (CLONE_UNTRACED | CLONE_IDLETASK))
        return 0;
    else if (clone_flags & CLONE_VFORK) {
        if (current->ptrace & PT_TRACE_VFORK)
            return PTRACE_EVENT_VFORK;
    } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
        if (current->ptrace & PT_TRACE_CLONE)
            return PTRACE_EVENT_CLONE;
    } else if (current->ptrace & PT_TRACE_FORK)
        return PTRACE_EVENT_FORK;

    return 0;
}

PCB的初始化

接下来通过copy_process函数,构造新的PCB,并把指针返回给变量p:

p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr);

copy_process的实现也在fork.c中,内容相当丰富:

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
struct task_struct *copy_process(unsigned long clone_flags,
                 unsigned long stack_start,
                 struct pt_regs *regs,
                 unsigned long stack_size,
                 int __user *parent_tidptr,
                 int __user *child_tidptr)
{
    int retval;
    struct task_struct *p = NULL;

    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);

    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);

    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);

    /*
     * CLONE_DETACHED must match CLONE_THREAD: it's a historical
     * thing.
     */
    if (!(clone_flags & CLONE_DETACHED) != !(clone_flags & CLONE_THREAD)) {
        /* Warn about the old no longer supported case so that we see it */
        if (clone_flags & CLONE_THREAD) {
            static int count;
            if (count < 5) {
                count++;
                printk(KERN_WARNING "%s trying to use CLONE_THREAD without CLONE_DETACH\n", current->comm);
            }
        }
        return ERR_PTR(-EINVAL);
    }

    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;

    retval = -ENOMEM;
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;

    retval = -EAGAIN;
    if (atomic_read(&p->user->processes) >=
            p->rlim[RLIMIT_NPROC].rlim_cur) {
        if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
                p->user != &root_user)
            goto bad_fork_free;
    }

    atomic_inc(&p->user->__count);
    atomic_inc(&p->user->processes);

    /*
     * If multiple threads are within copy_process(), then this check
     * triggers too late. This doesn't hurt, the check is only there
     * to stop root fork bombs.
     */
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    if (!try_module_get(p->thread_info->exec_domain->module))
        goto bad_fork_cleanup_count;

    if (p->binfmt && !try_module_get(p->binfmt->module))
        goto bad_fork_cleanup_put_domain;

#ifdef CONFIG_PREEMPT
    /*
     * schedule_tail drops this_rq()->lock so we compensate with a count
     * of 1.  Also, we want to start with kernel preemption disabled.
     */
    p->thread_info->preempt_count = 1;
#endif
    p->did_exec = 0;
    p->state = TASK_UNINTERRUPTIBLE;

    copy_flags(clone_flags, p);
    if (clone_flags & CLONE_IDLETASK)
        p->pid = 0;
    else {
        p->pid = alloc_pidmap();
        if (p->pid == -1)
            goto bad_fork_cleanup;
    }
    retval = -EFAULT;
    if (clone_flags & CLONE_PARENT_SETTID)
        if (put_user(p->pid, parent_tidptr))
            goto bad_fork_cleanup;

    p->proc_dentry = NULL;

    INIT_LIST_HEAD(&p->run_list);

    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    INIT_LIST_HEAD(&p->posix_timers);
    init_waitqueue_head(&p->wait_chldexit);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);
    spin_lock_init(&p->switch_lock);
    spin_lock_init(&p->proc_lock);

    clear_tsk_thread_flag(p, TIF_SIGPENDING);
    init_sigpending(&p->pending);

    p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
    p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
    init_timer(&p->real_timer);
    p->real_timer.data = (unsigned long) p;

    p->leader = 0;      /* session leadership doesn't inherit */
    p->tty_old_pgrp = 0;
    p->utime = p->stime = 0;
    p->cutime = p->cstime = 0;
    p->array = NULL;
    p->lock_depth = -1;     /* -1 = no lock */
    p->start_time = get_jiffies_64();
    p->security = NULL;
    p->io_context = NULL;

    retval = -ENOMEM;
    if ((retval = security_task_alloc(p)))
        goto bad_fork_cleanup;
    /* copy all the process information */
    if ((retval = copy_semundo(clone_flags, p)))
        goto bad_fork_cleanup_security;
    if ((retval = copy_files(clone_flags, p)))
        goto bad_fork_cleanup_semundo;
    if ((retval = copy_fs(clone_flags, p)))
        goto bad_fork_cleanup_files;
    if ((retval = copy_sighand(clone_flags, p)))
        goto bad_fork_cleanup_fs;
    if ((retval = copy_signal(clone_flags, p)))
        goto bad_fork_cleanup_sighand;
    if ((retval = copy_mm(clone_flags, p)))
        goto bad_fork_cleanup_signal;
    if ((retval = copy_namespace(clone_flags, p)))
        goto bad_fork_cleanup_mm;
    retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
    if (retval)
        goto bad_fork_cleanup_namespace;

    p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
    /*
     * Clear TID on mm_release()?
     */
    p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;

    /*
     * Syscall tracing should be turned off in the child regardless
     * of CLONE_PTRACE.
     */
    clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);

    /* Our parent execution domain becomes current domain
       These must match for thread signalling to apply */

    p->parent_exec_id = p->self_exec_id;

    /* ok, now we should be set up.. */
    p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
    p->pdeath_signal = 0;

    /*
     * Share the timeslice between parent and child, thus the
     * total amount of pending timeslices in the system doesn't change,
     * resulting in more scheduling fairness.
     */
    local_irq_disable();
        p->time_slice = (current->time_slice + 1) >> 1;
    /*
     * The remainder of the first timeslice might be recovered by
     * the parent if the child exits early enough.
     */
    p->first_time_slice = 1;
    current->time_slice >>= 1;
    p->timestamp = sched_clock();
    if (!current->time_slice) {
        /*
         * This case is rare, it happens when the parent has only
         * a single jiffy left from its timeslice. Taking the
         * runqueue lock is not a problem.
         */
        current->time_slice = 1;
        preempt_disable();
        scheduler_tick(0, 0);
        local_irq_enable();
        preempt_enable();
    } else
        local_irq_enable();
    /*
     * Ok, add it to the run-queues and make it
     * visible to the rest of the system.
     *
     * Let it rip!
     */
    p->tgid = p->pid;
    p->group_leader = p;
    INIT_LIST_HEAD(&p->ptrace_children);
    INIT_LIST_HEAD(&p->ptrace_list);

    /* Need tasklist lock for parent etc handling! */
    write_lock_irq(&tasklist_lock);
    /*
     * Check for pending SIGKILL! The new thread should not be allowed
     * to slip out of an OOM kill. (or normal SIGKILL.)
     */
    if (sigismember(&current->pending.signal, SIGKILL)) {
        write_unlock_irq(&tasklist_lock);
        retval = -EINTR;
        goto bad_fork_cleanup_namespace;
    }

    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & CLONE_PARENT)
        p->real_parent = current->real_parent;
    else
        p->real_parent = current;
    p->parent = p->real_parent;

    if (clone_flags & CLONE_THREAD) {
        spin_lock(&current->sighand->siglock);
        /*
         * Important: if an exit-all has been started then
         * do not create this new thread - the whole thread
         * group is supposed to exit anyway.
         */
        if (current->signal->group_exit) {
            spin_unlock(&current->sighand->siglock);
            write_unlock_irq(&tasklist_lock);
            retval = -EAGAIN;
            goto bad_fork_cleanup_namespace;
        }
        p->tgid = current->tgid;
        p->group_leader = current->group_leader;

        if (current->signal->group_stop_count > 0) {
            /*
             * There is an all-stop in progress for the group.
             * We ourselves will stop as soon as we check signals.
             * Make the new thread part of that group stop too.
             */
            current->signal->group_stop_count++;
            set_tsk_thread_flag(p, TIF_SIGPENDING);
        }

        spin_unlock(&current->sighand->siglock);
    }

    SET_LINKS(p);
    if (p->ptrace & PT_PTRACED)
        __ptrace_link(p, current->parent);

    attach_pid(p, PIDTYPE_PID, p->pid);
    if (thread_group_leader(p)) {
        attach_pid(p, PIDTYPE_TGID, p->tgid);
        attach_pid(p, PIDTYPE_PGID, process_group(p));
        attach_pid(p, PIDTYPE_SID, p->session);
        if (p->pid)
            __get_cpu_var(process_counts)++;
    } else
        link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid);

    nr_threads++;
    write_unlock_irq(&tasklist_lock);
    retval = 0;

fork_out:
    if (retval)
        return ERR_PTR(retval);
    return p;

bad_fork_cleanup_namespace:
    exit_namespace(p);
bad_fork_cleanup_mm:
    exit_mm(p);
bad_fork_cleanup_signal:
    exit_signal(p);
bad_fork_cleanup_sighand:
    exit_sighand(p);
bad_fork_cleanup_fs:
    exit_fs(p); /* blocking */
bad_fork_cleanup_files:
    exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
    exit_sem(p);
bad_fork_cleanup_security:
    security_task_free(p);
bad_fork_cleanup:
    if (p->pid > 0)
        free_pidmap(p->pid);
    if (p->binfmt)
        module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
    module_put(p->thread_info->exec_domain->module);
bad_fork_cleanup_count:
    atomic_dec(&p->user->processes);
    free_uid(p->user);
bad_fork_free:
    free_task(p);
    goto fork_out;
}

我们把这些内容切分成几块来看:

  1. 错误检查:验证参数是否有效,由于这里是内核,出现了异常参数就不是段错误那么简单的问题了,所以必须严格检查:

    if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
        return ERR_PTR(-EINVAL);
    
    /*
     * Thread groups must share signals as well, and detached threads
     * can only be started up within the thread group.
     */
    if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
        return ERR_PTR(-EINVAL);
    
    /*
     * Shared signal handlers imply shared VM. By way of the above,
     * thread groups also imply shared VM. Blocking this case allows
     * for various simplifications in other code.
     */
    if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
        return ERR_PTR(-EINVAL);
    
    /*
     * CLONE_DETACHED must match CLONE_THREAD: it's a historical
     * thing.
     */
    if (!(clone_flags & CLONE_DETACHED) != !(clone_flags & CLONE_THREAD)) {
        /* Warn about the old no longer supported case so that we see it */
        if (clone_flags & CLONE_THREAD) {
            static int count;
            if (count < 5) {
                count++;
                printk(KERN_WARNING "%s trying to use CLONE_THREAD without CLONE_DETACH\n", current->comm);
            }
        }
        return ERR_PTR(-EINVAL);
    }

    其中,ERR_PTR这个函数定义在${LINUX_SOURCE}/include/linux/err.h中,内容十分简单

    static inline void *ERR_PTR(long error)
    {
    return (void *) error;
    }

    将错误号转化成指针类型返回

    而printk是内核中使用的printf,功能基本上相近,此处不再展开

  2. 初始化新的数据结构,内容复制,检查进程数是否超限,更新用户信息

    接下来的工作就像上面描述的顺序一样:

    retval = security_task_create(clone_flags);
    if (retval)
        goto fork_out;
    
    retval = -ENOMEM;
    p = dup_task_struct(current);
    if (!p)
        goto fork_out;
    
    retval = -EAGAIN;
    if (atomic_read(&p->user->processes) >=
            p->rlim[RLIMIT_NPROC].rlim_cur) {
        if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
                p->user != &root_user)
            goto bad_fork_free;
    }
    
    atomic_inc(&p->user->__count);
    atomic_inc(&p->user->processes);

    同样,也是每个操作后都要检查合法性

    • security_task_create函数是对一个“方法”的转发,定义在${LINUX_SOURCE}/include/linux/security.h中,内容如下:

      static inline int security_task_create (unsigned long clone_flags)
      {
      return security_ops->task_create (clone_flags);
      }

      而security_ops这个结构体也定义在这个文件当中,成员全部是函数指针

    • dup_task_struct:真正完成了新PCB的创建,并复制了父进程(当前进程)PCB的内容,如下:

      static struct task_struct *dup_task_struct(struct task_struct *orig)
      {
      struct task_struct *tsk;
      struct thread_info *ti;
      
      prepare_to_copy(orig);
      
      tsk = alloc_task_struct();
      if (!tsk)
          return NULL;
      
      ti = alloc_thread_info(tsk);
      if (!ti) {
          free_task_struct(tsk);
          return NULL;
      }
      
      *ti = *orig->thread_info;
      *tsk = *orig;
      tsk->thread_info = ti;
      ti->task = tsk;
      
      /* One for us, one for whoever does the "release_task()" (usually parent) */
      atomic_set(&tsk->usage,2);
      return tsk;
      }
      • prepare_to_copy函数定义在process.c中,对父进程的PCB做了一系列置位操作
      • alloc_task_struct函数在slab中为子进程申请一个新的PCB空间;同样的alloc_thread_info也在对应的slab中为子进程申请了一块新的空间。而对应的如果这些过程出现问题,就直接通过free_task_struct释放掉已申请的空间
  3. 检查进程数是否溢出并检查模块可用性

    /*
     * If multiple threads are within copy_process(), then this check
     * triggers too late. This doesn't hurt, the check is only there
     * to stop root fork bombs.
     */
    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;
    
    if (!try_module_get(p->thread_info->exec_domain->module))
        goto bad_fork_cleanup_count;
    
    if (p->binfmt && !try_module_get(p->binfmt->module))
        goto bad_fork_cleanup_put_domain;

    这里的max_threads是一个全局变量,赋值在fork_init函数中,如下:

    max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8;

    看起来是根据内存大小决定的。

  4. 完成新PCB标志的复制与设置:

    
    #ifdef CONFIG_PREEMPT
    
    /*
     * schedule_tail drops this_rq()->lock so we compensate with a count
     * of 1.  Also, we want to start with kernel preemption disabled.
     */
    p->thread_info->preempt_count = 1;
    
    #endif
    
    p->did_exec = 0;
    p->state = TASK_UNINTERRUPTIBLE;
    
    copy_flags(clone_flags, p);
    if (clone_flags & CLONE_IDLETASK)
        p->pid = 0;
    else {
        p->pid = alloc_pidmap();
        if (p->pid == -1)
            goto bad_fork_cleanup;
    }
    retval = -EFAULT;
    if (clone_flags & CLONE_PARENT_SETTID)
        if (put_user(p->pid, parent_tidptr))
            goto bad_fork_cleanup;
  5. 进程同步机制、元数据的初始化:

    p->proc_dentry = NULL;
    
    INIT_LIST_HEAD(&p->run_list);
    
    INIT_LIST_HEAD(&p->children);
    INIT_LIST_HEAD(&p->sibling);
    INIT_LIST_HEAD(&p->posix_timers);
    init_waitqueue_head(&p->wait_chldexit);
    p->vfork_done = NULL;
    spin_lock_init(&p->alloc_lock);
    spin_lock_init(&p->switch_lock);
    spin_lock_init(&p->proc_lock);
    
    clear_tsk_thread_flag(p, TIF_SIGPENDING);
    init_sigpending(&p->pending);
    
    p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
    p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
    init_timer(&p->real_timer);
    p->real_timer.data = (unsigned long) p;
    
    p->leader = 0;      /* session leadership doesn't inherit */
    p->tty_old_pgrp = 0;
    p->utime = p->stime = 0;
    p->cutime = p->cstime = 0;
    p->array = NULL;
    p->lock_depth = -1;     /* -1 = no lock */
    p->start_time = get_jiffies_64();
    p->security = NULL;
    p->io_context = NULL;
    
  6. 根据clone_flags为子进程复制各个描述符:

    retval = -ENOMEM;
    if ((retval = security_task_alloc(p)))
        goto bad_fork_cleanup;
    /* copy all the process information */
    if ((retval = copy_semundo(clone_flags, p)))
        goto bad_fork_cleanup_security;
    if ((retval = copy_files(clone_flags, p)))
        goto bad_fork_cleanup_semundo;
    if ((retval = copy_fs(clone_flags, p)))
        goto bad_fork_cleanup_files;
    if ((retval = copy_sighand(clone_flags, p)))
        goto bad_fork_cleanup_fs;
    if ((retval = copy_signal(clone_flags, p)))
        goto bad_fork_cleanup_sighand;
    if ((retval = copy_mm(clone_flags, p)))
        goto bad_fork_cleanup_signal;
    if ((retval = copy_namespace(clone_flags, p)))
        goto bad_fork_cleanup_mm;
    retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
    if (retval)
        goto bad_fork_cleanup_namespace;

    这里出现的copy函数一般都采用如下策略:

    • clone_flags中设置了相关的共享标识,则直接将父进程(当前进程)的相关描述符的地址返回
    • 否则,为子进程重新建立一个描述符
  7. 父进程将自己的时间片分给子进程

    /*
     * Share the timeslice between parent and child, thus the
     * total amount of pending timeslices in the system doesn't change,
     * resulting in more scheduling fairness.
     */
    local_irq_disable();
           p->time_slice = (current->time_slice + 1) >> 1;
    /*
     * The remainder of the first timeslice might be recovered by
     * the parent if the child exits early enough.
     */
    p->first_time_slice = 1;
    current->time_slice >>= 1;
    p->timestamp = sched_clock();
    if (!current->time_slice) {
        /*
         * This case is rare, it happens when the parent has only
         * a single jiffy left from its timeslice. Taking the
         * runqueue lock is not a problem.
         */
        current->time_slice = 1;
        preempt_disable();
        scheduler_tick(0, 0);
        local_irq_enable();
        preempt_enable();
    } else
        local_irq_enable();
    • 涉及到时间片的分割,首先上来就把中断关了。local_irq_disable和local_irq_enable都是宏,在x86_64上内容分别是汇编的cli(清除中断标志位,关中断)和sti(设置中断标志位,开中断)
    • first_time_slice是一个标志,置1表示子进程正处于它的第一个时间片中。子进程如果结束得够快(在第一个时间片之内),父进程可以将这个时间片的剩余部分收回来。这是从注释中获得的信息,具体实现还要看进程结束的逻辑
    • 如果分割之后父进程的时间片变为0,就先把父进程的时间片置为1,然后调用scheduler_tick处理
    • 打开中断
  8. 将新进程加入进程树

    /*
     * Ok, add it to the run-queues and make it
     * visible to the rest of the system.
     *
     * Let it rip!
     */
    p->tgid = p->pid;
    p->group_leader = p;
    INIT_LIST_HEAD(&p->ptrace_children);
    INIT_LIST_HEAD(&p->ptrace_list);
    
    /* Need tasklist lock for parent etc handling! */
    write_lock_irq(&tasklist_lock);
    /*
     * Check for pending SIGKILL! The new thread should not be allowed
     * to slip out of an OOM kill. (or normal SIGKILL.)
     */
    if (sigismember(&current->pending.signal, SIGKILL)) {
        write_unlock_irq(&tasklist_lock);
        retval = -EINTR;
        goto bad_fork_cleanup_namespace;
    }
    
    /* CLONE_PARENT re-uses the old parent */
    if (clone_flags & CLONE_PARENT)
        p->real_parent = current->real_parent;
    else
        p->real_parent = current;
    p->parent = p->real_parent;
    
    if (clone_flags & CLONE_THREAD) {
        spin_lock(&current->sighand->siglock);
        /*
         * Important: if an exit-all has been started then
         * do not create this new thread - the whole thread
         * group is supposed to exit anyway.
         */
        if (current->signal->group_exit) {
            spin_unlock(&current->sighand->siglock);
            write_unlock_irq(&tasklist_lock);
            retval = -EAGAIN;
            goto bad_fork_cleanup_namespace;
        }
        p->tgid = current->tgid;
        p->group_leader = current->group_leader;
    
        if (current->signal->group_stop_count > 0) {
            /*
             * There is an all-stop in progress for the group.
             * We ourselves will stop as soon as we check signals.
             * Make the new thread part of that group stop too.
             */
            current->signal->group_stop_count++;
            set_tsk_thread_flag(p, TIF_SIGPENDING);
        }
    
        spin_unlock(&current->sighand->siglock);
    }
    • 由于CLONE_PARENT的存在,新进程的父进程到底是谁还是个问题:如果设置了该标识,那么新进程的父进程是当前进程的父进程;否则就是当前进程。
    • 同时,如果子进程设置了CLONE_THREAD标识,那么子进程将加入父进程的线程组
  9. 将新的PCB与系统中维护的PID数据结构联系起来

    SET_LINKS(p);
    if (p->ptrace & PT_PTRACED)
        __ptrace_link(p, current->parent);
    
    attach_pid(p, PIDTYPE_PID, p->pid);
    if (thread_group_leader(p)) {
        attach_pid(p, PIDTYPE_TGID, p->tgid);
        attach_pid(p, PIDTYPE_PGID, process_group(p));
        attach_pid(p, PIDTYPE_SID, p->session);
        if (p->pid)
            __get_cpu_var(process_counts)++;
    } else
        link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid);
    
    nr_threads++;
    write_unlock_irq(&tasklist_lock);
    retval = 0;
    

    至此,copy_process完成,返回的PCB已经配置完毕,并与系统相关数据结构建立起了联系

收尾工作

等copy_process返回的时候,子进程已经基本建立完成了,下面的工作就是设置子进程状态,然后要求调度器重新调度。然而由于vfork的特殊性,在这个标识下还需要对父进程进行一系列操作

if (!IS_ERR(p)) {
        struct completion vfork;

        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
            /*
             * We'll start up with an immediate SIGSTOP.
             */
            sigaddset(&p->pending.signal, SIGSTOP);
            set_tsk_thread_flag(p, TIF_SIGPENDING);
        }

        p->state = TASK_STOPPED;
        if (!(clone_flags & CLONE_STOPPED))
            wake_up_forked_process(p);  /* do this last */
        ++total_forks;

        if (unlikely (trace)) {
            current->ptrace_message = pid;
            ptrace_notify ((trace << 8) | SIGTRAP);
        }

        if (clone_flags & CLONE_VFORK) {
            wait_for_completion(&vfork);
            if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
                ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
        } else
            /*
             * Let the child process run first, to avoid most of the
             * COW overhead when the child exec()s afterwards.
             */
            set_need_resched();
    }

set_need_resched函数会设置父进程的TIF_NEED_RESCHED标识,下一次时钟中断的时候,父进程就会放弃CPU,让新建的子进程执行

总结

一次成功的fork系统调用的流程,可以用下面这张流程图简单描述
这里写图片描述

参考资料&辅助工具

猜你喜欢

转载自blog.csdn.net/qq_37613882/article/details/79726466