内核中访问空指针（基于kernel-4.9）

在C语言中，我们用NULL来表示空指针。空指针是一个特殊的指针值，在常见平台上它就是0地址，int *p = NULL和int *p = 0是等价的写法。空指针并不是“未赋值”的指针，而是被明确赋值为“不指向任何有效对象”的指针；如果对它解引用（访问该地址），那么程序会出错。

如果在Linux应用程序中访问NULL指针：

会收到SIGSEGV信号（也就是常见的Segmentation fault）。默认行为是该用户进程被终止，程序当然也可以捕获对应的信号自行处理。这种用户态的错误是不会导致系统crash的。

如果是在内核中访问到NULL指针分多种情况：

（1）如果是在内核态的进程上下文中访问，那么会执行oops动作，杀死当前进程，并打印相关信息。

（2）如果是在内核态的其他上下文中（比如中断上下文），那么系统会执行panic动作。

（3）如果内核中有配置panic_on_oops，那么上面发生oops的场景也会发生panic。

接下来我们来看一下代码中是如何执行的。当我们访问一个页表中没有映射的地址时，CPU会首先触发一个异常，这个缺页异常是由硬件自动触发的。

首先我们需要看异常向量表。当发生data abort异常时，ARM64会首先去执行entry.S中定义的el1_da（异常来自内核态）或el0_da（异常来自用户态）：

el1_da:
......
  mov x2, sp              // struct pt_regs 
bl  do_mem_abort   //主处理函数，C代码
......
el0_da:
/*                                                         
 * Data abort handling                                     
 *
 * Reads the fault address register, prepares the three arguments in
 * x0-x2 and calls the C handler do_mem_abort(), then branches to
 * ret_to_user.
 */                                                        
mrs x26, far_el1                                           // x26 = contents of FAR_EL1
// enable interrupts before calling the main handler       
enable_dbg_and_irq                                         
ct_user_exit                                               
clear_address_tag x0, x26                                  // x0 (first argument) is derived from the value in x26
mov x1, x25                                                // second argument; presumably x25 holds the ESR value saved earlier -- confirm against the entry code
mov x2, sp                                                 // third argument: sp, used as the struct pt_regs pointer
bl  do_mem_abort    // main handler, implemented in C
b   ret_to_user

下面我们依次跟踪下去，可以看到会执行到arch/arm64/mm/fault.c中的do_mem_abort：


/*
 * Dispatch a data abort to the relevant handler.
 *
 * The fault_info entry selected by @esr supplies the handler.  A handler
 * that returns zero has dealt with the fault and nothing more is done.
 * Otherwise the fault is logged as unhandled and passed on to
 * arm64_notify_die() together with the signal number and code taken from
 * the same table entry.
 *
 * @addr: faulting address
 * @esr:  syndrome value used to pick the fault_info entry
 * @regs: register state passed in by the exception entry code
 */
asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
                     struct pt_regs *regs)
{
    struct siginfo si;
    const struct fault_info *entry;

    /* Pick the fault_info array member that corresponds to this esr. */
    entry = esr_to_fault_info(esr);

    if (entry->fn(addr, esr, regs) == 0)
        return;

    pr_alert("Unhandled fault: %s (0x%08x) at 0x%016lx\n",
         entry->name, esr, addr);

    si.si_addr  = (void __user *)addr;
    si.si_code  = entry->code;
    si.si_errno = 0;
    si.si_signo = entry->sig;

    arm64_notify_die("", regs, &si, esr);
}

 /*
  * Return the fault_info entry for an ESR value.  Only the low six bits of
  * @esr are used as the index, so the result always lies within the first
  * 64 entries of the table.
  */
 static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
 {
     const unsigned int idx = esr & 63;

     return &fault_info[idx];
 }

下面来看fault info数组的定义：

/*
 * Fault dispatch table with 64 entries, indexed by the low six bits of the
 * ESR value (see esr_to_fault_info(), which computes esr & 63).
 *
 * Each entry lists, in order: the handler function, the signal number, the
 * signal code and a human-readable description of the fault.  The position
 * of an entry is its index, so entries must not be reordered; the
 * "unknown N" placeholders keep the indices aligned.
 */
static const struct fault_info fault_info[] = {
    { do_bad,       SIGBUS,  0,     "ttbr address size fault"   },
    { do_bad,       SIGBUS,  0,     "level 1 address size fault"    },
    { do_bad,       SIGBUS,  0,     "level 2 address size fault"    },
    { do_bad,       SIGBUS,  0,     "level 3 address size fault"    },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 0 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 1 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 2 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 3 translation fault" },
    { do_bad,       SIGBUS,  0,     "unknown 8"         },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 1 access flag fault" },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 2 access flag fault" },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 3 access flag fault" },
    { do_bad,       SIGBUS,  0,     "unknown 12"            },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 1 permission fault"  },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 2 permission fault"  },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 3 permission fault"  },
    { do_bad,       SIGBUS,  0,     "synchronous external abort"    },
    { do_bad,       SIGBUS,  0,     "unknown 17"            },
    { do_bad,       SIGBUS,  0,     "unknown 18"            },
    { do_bad,       SIGBUS,  0,     "unknown 19"            },
    { do_bad,       SIGBUS,  0,     "synchronous external abort (translation table walk)" },
    { do_bad,       SIGBUS,  0,     "synchronous external abort (translation table walk)" },
    { do_bad,       SIGBUS,  0,     "synchronous external abort (translation table walk)" },
    { do_bad,       SIGBUS,  0,     "synchronous external abort (translation table walk)" },
    { do_bad,       SIGBUS,  0,     "synchronous parity error"  },
    { do_bad,       SIGBUS,  0,     "unknown 25"            },
    { do_bad,       SIGBUS,  0,     "unknown 26"            },
    { do_bad,       SIGBUS,  0,     "unknown 27"            },
    { do_bad,       SIGBUS,  0,     "synchronous parity error (translation table walk)" },
    { do_bad,       SIGBUS,  0,     "synchronous parity error (translation table walk)" },
    { do_bad,       SIGBUS,  0,     "synchronous parity error (translation table walk)" },
    { do_bad,       SIGBUS,  0,     "synchronous parity error (translation table walk)" },
    { do_bad,       SIGBUS,  0,     "unknown 32"            },
    { do_alignment_fault,   SIGBUS,  BUS_ADRALN,    "alignment fault"       },
    { do_bad,       SIGBUS,  0,     "unknown 34"            },
    { do_bad,       SIGBUS,  0,     "unknown 35"            },
    { do_bad,       SIGBUS,  0,     "unknown 36"            },
    { do_bad,       SIGBUS,  0,     "unknown 37"            },
    { do_bad,       SIGBUS,  0,     "unknown 38"            },
    { do_bad,       SIGBUS,  0,     "unknown 39"            },
    { do_bad,       SIGBUS,  0,     "unknown 40"            },
    { do_bad,       SIGBUS,  0,     "unknown 41"            },
    { do_bad,       SIGBUS,  0,     "unknown 42"            },
    { do_bad,       SIGBUS,  0,     "unknown 43"            },
    { do_bad,       SIGBUS,  0,     "unknown 44"            },
    { do_bad,       SIGBUS,  0,     "unknown 45"            },
    { do_bad,       SIGBUS,  0,     "unknown 46"            },
    { do_bad,       SIGBUS,  0,     "unknown 47"            },
    { do_tlb_conf_fault,    SIGBUS,  0,     "TLB conflict abort"        },
    { do_bad,       SIGBUS,  0,     "unknown 49"            },
    { do_bad,       SIGBUS,  0,     "unknown 50"            },
    { do_bad,       SIGBUS,  0,     "unknown 51"            },
    { do_bad,       SIGBUS,  0,     "implementation fault (lockdown abort)" },
    { do_bad,       SIGBUS,  0,     "implementation fault (unsupported exclusive)" },
    { do_bad,       SIGBUS,  0,     "unknown 54"            },
    { do_bad,       SIGBUS,  0,     "unknown 55"            },
    { do_bad,       SIGBUS,  0,     "unknown 56"            },
    { do_bad,       SIGBUS,  0,     "unknown 57"            },
    { do_bad,       SIGBUS,  0,     "unknown 58"            },
    { do_bad,       SIGBUS,  0,     "unknown 59"            },
    { do_bad,       SIGBUS,  0,     "unknown 60"            },
    { do_bad,       SIGBUS,  0,     "section domain fault"      },
    { do_bad,       SIGBUS,  0,     "page domain fault"     },
    { do_bad,       SIGBUS,  0,     "unknown 63"            },
};

上面定义的fault_info数组中，关键的处理函数主要有如下几个，分别是：

do_bad/do_alignment_fault/do_tlb_conf_fault/do_translation_fault/do_page_fault。

回到最初的问题，如果我们想要访问的是指针0地址，由于该地址在页表中没有映射，触发的是translation fault，内核会先进入do_translation_fault；对于用户地址空间范围内的地址（0地址就属于这个范围），它会再调用do_page_fault这个函数来进行处理。我们来继续跟踪一下，看看do_page_fault具体做了什么吧。

/*
 * Handle a page fault at @addr described by @esr.
 *
 * The fault is first offered to notify_page_fault().  If it may be handled
 * on behalf of a user address space, __do_page_fault() is called with
 * mmap_sem held for reading (retried at most once).  Faults that cannot be
 * resolved end in one of two places:
 *   - __do_user_fault()   for faults taken in user mode, which forces a
 *                         signal onto the task;
 *   - __do_kernel_fault() (label no_context) for everything else.
 *
 * @addr: faulting address
 * @esr:  syndrome value describing the abort
 * @regs: register state at the time of the fault
 *
 * Every return path in this function returns 0.
 */
static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
                   struct pt_regs *regs)
{
    struct task_struct *tsk;
    struct mm_struct *mm;
    int fault, sig, code;
    /* Defaults: a read or write access is acceptable, and one retry is allowed. */
    unsigned long vm_flags = VM_READ | VM_WRITE;
    unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
    /* A nonzero result from notify_page_fault() ends handling here. */
    if (notify_page_fault(regs, esr))
        return 0;
    tsk = current;
    mm  = tsk->mm;
    /*
     * If we're in an interrupt or have no user context, we must not take
     * the fault.
     *
     * faulthandler_disabled() covers the "fault handler disabled or in
     * interrupt context" case (per the article's note; its body is not
     * shown here), and !mm means the current task has no user address
     * space.  Either way, go straight to the kernel-fault path.
     */
    if (faulthandler_disabled() || !mm)
        goto no_context;
    if (user_mode(regs))
        mm_flags |= FAULT_FLAG_USER;
    /*
     * Narrow the required VMA permission: an EL0 instruction abort needs
     * VM_EXEC; a write (WNR set without CM, or CM set for a non-user
     * fault) needs VM_WRITE and is flagged as a write fault.
     */
    if (is_el0_instruction_abort(esr)) {
        vm_flags = VM_EXEC;
    } else if (((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) ||
            ((esr & ESR_ELx_CM) && !(mm_flags & FAULT_FLAG_USER))) {
        vm_flags = VM_WRITE;
        mm_flags |= FAULT_FLAG_WRITE;
    }
    /*
     * Permission fault on an address below USER_DS: each die() message
     * below names the condition being rejected.
     */
    if (addr < USER_DS && is_permission_fault(esr, regs)) {
        /* regs->orig_addr_limit may be 0 if we entered from EL0 */
        if (regs->orig_addr_limit == KERNEL_DS)
            die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
        if (is_el1_instruction_abort(esr))
            die("Attempting to execute userspace memory", regs, esr);
        if (!search_exception_tables(regs->pc))
            die("Accessing user space memory outside uaccess.h routines", regs, esr);
    }
    /*
     * As per x86, we may deadlock here. However, since the kernel only
     * validly references user space from well defined areas of the code,
     * we can bug out early if this is from code which shouldn't.
     */
    if (!down_read_trylock(&mm->mmap_sem)) {
        /* Non-user fault with no exception-table entry for this pc: give up. */
        if (!user_mode(regs) && !search_exception_tables(regs->pc))
            goto no_context;
        /* The retry path below re-enters here to take mmap_sem again. */
retry:
        down_read(&mm->mmap_sem);
    } else {
        /*
         * The above down_read_trylock() might have succeeded in which
         * case, we'll have missed the might_sleep() from down_read().
         */
        might_sleep();
#ifdef CONFIG_DEBUG_VM
        if (!user_mode(regs) && !search_exception_tables(regs->pc))
            goto no_context;
#endif
    }
    fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);
    /*
     * If we need to retry but a fatal signal is pending, handle the
     * signal first. We do not need to release the mmap_sem because it
     * would already be released in __lock_page_or_retry in mm/filemap.c.
     */
    if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
        if (!user_mode(regs))
            goto no_context;
        return 0;
    }
    /*
     * Major/minor page fault accounting is only done on the initial
     * attempt. If we go through a retry, it is extremely likely that the
     * page will be found in page cache at that point.
     */
    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
    if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
        if (fault & VM_FAULT_MAJOR) {
            tsk->maj_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
                      addr);
        } else {
            tsk->min_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
                      addr);
        }
        if (fault & VM_FAULT_RETRY) {
            /*
             * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
             * starvation.
             */
            mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
            mm_flags |= FAULT_FLAG_TRIED;
            goto retry;
        }
    }
    up_read(&mm->mmap_sem);
    /*
     * Handle the "normal" case first - VM_FAULT_MAJOR
     *
     * i.e. none of the error / bad-map / bad-access bits is set, so the
     * fault was resolved.
     */
    if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
                  VM_FAULT_BADACCESS))))
        return 0;
    /*
     * If we are in kernel mode at this point, we have no context to
     * handle this fault with.
     */
    if (!user_mode(regs))
        goto no_context;
    if (fault & VM_FAULT_OOM) {
        /*
         * We ran out of memory, call the OOM killer, and return to
         * userspace (which will retry the fault, or kill us if we got
         * oom-killed).
         */
        pagefault_out_of_memory();
        return 0;
    }
    if (fault & VM_FAULT_SIGBUS) {
        /*
         * We had some memory, but were unable to successfully fix up
         * this page fault.
         */
        sig = SIGBUS;
        code = BUS_ADRERR;
    } else {
        /*
         * Something tried to access memory that isn't in our memory
         * map.
         */
        sig = SIGSEGV;
        code = fault == VM_FAULT_BADACCESS ?
            SEGV_ACCERR : SEGV_MAPERR;
    }
    /*
     * Reached for a user-mode fault on an address that could not be
     * mapped (address 0, for example): report it via __do_user_fault().
     */
    __do_user_fault(tsk, addr, esr, sig, code, regs);
    return 0;
    /*
     * Faults that cannot be handled on behalf of a user context jump
     * here and are passed to __do_kernel_fault().
     */
no_context:
    __do_kernel_fault(mm, addr, esr, regs);
    return 0;
}

此函数的中间部分都是在进行缺页异常的正常处理，比如建立页表、从伙伴系统分配内存等等，我们暂且不去进一步分析，只看最后出现异常的情况。异常分两种：一种是有用户上下文，一种是没有用户上下文，分别会去执行__do_user_fault和__do_kernel_fault。这正好和我们前面描述的两种访问0地址的场景对应上了。

/*
 * Something tried to access memory that isn't in our memory map. User mode
 * accesses just cause a SIGSEGV
 */
static void __do_user_fault(struct task_struct *tsk, unsigned long addr,
                unsigned int esr, unsigned int sig, int code,
                struct pt_regs *regs)
{
    struct siginfo si;
    const struct fault_info *inf;
    trace_user_fault(tsk, addr, esr);
    if (unhandled_signal(tsk, sig) && show_unhandled_signals_ratelimited()) {
        inf = esr_to_fault_info(esr);
        pr_info("%s[%d]: unhandled %s (%d) at 0x%08lx, esr 0x%03x\n",
            tsk->comm, task_pid_nr(tsk), inf->name, sig,
            addr, esr);
        show_pte(tsk->mm, addr);
        show_regs(regs);
    }
    tsk->thread.fault_address = addr;
    tsk->thread.fault_code = esr;
    si.si_signo = sig;
    si.si_errno = 0;
    si.si_code = code;
    si.si_addr = (void __user *)addr;
    force_sig_info(sig, &si, tsk);
}

/*
 * The kernel tried to access some page that wasn't present.
 *
 * Unless the abort is an EL1 instruction abort, an exception fixup is
 * tried first and the function returns if it succeeds.  Otherwise the
 * fault is logged, reported as a "NULL pointer dereference" when @addr
 * lies below PAGE_SIZE and as a "paging request" otherwise, followed by
 * die() and do_exit(SIGKILL).
 */
static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr,
                  unsigned int esr, struct pt_regs *regs)
{
    const char *what;

    /*
     * Are we prepared to handle this kernel fault?
     * Instruction faults are never fixed up.
     */
    if (!is_el1_instruction_abort(esr)) {
        if (fixup_exception(regs))
            return;
    }

    what = "paging request";
    if (addr < PAGE_SIZE)
        what = "NULL pointer dereference";

    /*
     * No handler, so terminate things with extreme prejudice.
     */
    bust_spinlocks(1);
    pr_alert("Unable to handle kernel %s at virtual address %08lx\n",
         what, addr);
    show_pte(mm, addr);
    die("Oops", regs, esr);
    bust_spinlocks(0);
    do_exit(SIGKILL);
}

内核中访问空指针（基于kernel-4.9）

猜你喜欢