Linux kernel oops

本文以ARM64为例,介绍内核的Oops机制,我们使用grep搜索一下内核中可能会报Oops的地方:

./arch/arm64/kernel/sys_compat.c:142:	arm64_notify_die("Oops - bad compat syscall(2)", regs, &info, scno);
./arch/arm64/kernel/traps.c:771:	die("Oops - bad mode", regs, 0);
./arch/arm64/kernel/traps.c:929:		die("Oops - BUG", regs, 0);
./arch/arm64/mm/fault.c:270:	die("Oops", regs, esr);

搜索结果如上所示,一共有这几个地方定义为Oops,因此Oops可能包含如下一些场景:

  1. 32bit 兼容模式(compat)系统调用发生了错误,报Oops
  2. CPU陷入了某种不正常的exception mode,在该exception对应的exception vector entry中直接报Oops
  3. traps中定义的BUG()函数被调用触发了Oops
  4. 内核空间中发生了内存地址相关的访问异常

本文着重从第4种情况来入手跟踪Oops的发生过程:

在代码文件 ./arch/arm64/mm/fault.c 中:

do_translation_fault --> do_bad_area --> __do_kernel_fault --> die_kernel_fault
do_alignment_fault --> do_bad_area --> __do_kernel_fault --> die_kernel_fault
do_page_fault --> __do_kernel_fault --> die_kernel_fault

调用路径如上所示,当内核访问一个内存地址发生错误时,会根据错误类型分别调用对应的 do_xxx_fault 函数,这些函数最终都会走到 die_kernel_fault:

/*
 * Report a kernel memory fault that could not be handled: print an alert
 * line containing the address, call mem_abort_decode() and show_pte(),
 * then raise the Oops through die().
 *
 * msg:  short description of the fault, inserted into the alert line
 * addr: virtual address printed in the alert and passed to show_pte()
 * esr:  exception syndrome value, passed to mem_abort_decode() and
 *       forwarded to die() as its error argument
 * regs: saved register state, forwarded to die()
 */
static void die_kernel_fault(const char *msg, unsigned long addr,
                 unsigned int esr, struct pt_regs *regs)
{
    /* Paired with the bust_spinlocks(0) call after die() below. */
    bust_spinlocks(1);

    pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
         addr);

    mem_abort_decode(esr);

    show_pte(addr);
    /* The "Oops" string is the message that die() reports. */
    die("Oops", regs, esr);
    /*
     * Only reached if die() returns, i.e. it neither panicked nor called
     * do_exit() itself; the current task is then ended with SIGKILL.
     */
    bust_spinlocks(0);
    do_exit(SIGKILL);
}

这里最终会调用 die("Oops", regs, esr) 函数:

/*
 * This function is protected against re-entrancy.
 *
 * Common Oops entry point: reports the failure through __die(), optionally
 * hands over to crash_kexec(), then either panics or ends the current task.
 *
 * str:  message passed on to __die() (e.g. "Oops", "Oops - BUG")
 * regs: saved register state; may be NULL, in which case crash_kexec()
 *       is skipped
 * err:  error value passed on to __die()
 *
 * die_lock is held with interrupts saved/disabled from entry until just
 * before the final do_exit() decision; the panic() calls are made while
 * it is still held.
 */
void die(const char *str, struct pt_regs *regs, int err)
{
    int ret;
    unsigned long flags;

    raw_spin_lock_irqsave(&die_lock, flags);

    oops_enter();

    console_verbose();
    bust_spinlocks(1);
    ret = __die(str, err, regs); // sends the notify_die notification internally

    /* Give kexec a chance to take over if it wants this task's crash. */
    if (regs && kexec_should_crash(current))
        crash_kexec(regs);

    bust_spinlocks(0);
    add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
    oops_exit();

    /* An Oops in interrupt context always escalates to a panic. */
    if (in_interrupt())
        panic("Fatal exception in interrupt");
    if (panic_on_oops)     // decide whether to panic
        panic("Fatal exception");

    raw_spin_unlock_irqrestore(&die_lock, flags);

    /*
     * Unless __die() returned NOTIFY_STOP, end the current task with
     * SIGSEGV; otherwise return to the caller.
     */
    if (ret != NOTIFY_STOP)
        do_exit(SIGSEGV);
}

在die中可以看到,只有当Oops发生在中断上下文(in_interrupt() 为真),或者配置了panic_on_oops为1时,才会直接触发panic操作;否则并不会导致系统panic重启,而是在 ret != NOTIFY_STOP 时通过 do_exit(SIGSEGV) 结束当前进程。无论哪种情况,Oops都会打印内核调用栈。

一种手动触发panic的机制

利用sysrq机制可以触发kernel crash:

echo c > /proc/sysrq-trigger

这种方式就是利用Oops机制来触发panic的:

/*
 * SysRq crash handler: forces panic_on_oops to 1 and then deliberately
 * faults by storing through a NULL pointer.
 *
 * key: not used in this function body
 */
static void sysrq_handle_crash(int key)
{
    /* Left NULL on purpose: the store through it below is the crash. */
    char *killer = NULL;

    /* we need to release the RCU read lock here,
     * otherwise we get an annoying
     * 'BUG: sleeping function called from invalid context'
     * complaint from the kernel before the panic.
     */
    rcu_read_unlock();
    panic_on_oops = 1;  /* force panic */  //-------- (1)
    /*
     * NOTE(review): presumably orders the panic_on_oops store ahead of
     * the faulting store below - confirm against wmb() documentation.
     */
    wmb();
    *killer = 1; //---------------------------(2) store through the NULL pointer
}
  • 第(1)步先配置panic_on_oops为1,使得当内核oops时直接触发panic操作
  • 第(2)步向NULL空指针地址写入数据,触发oops操作

到这里可能很多人会有一个疑惑,对一个内核空地址赋值,是如何产生了Oops呢?

查看arm64的异常向量表:

 /*
  * EL1 mode handlers.
  */

 el1_da:
     /*
      * Data abort handling
      *
      * Prepares the arguments for do_mem_abort(addr, esr, regs) and
      * calls it: x0 receives the fault address, x2 the stack pointer
      * (struct pt_regs).
      * NOTE(review): x1 (esr argument) and x23 (pstate) are not written
      * in this excerpt - presumably set up by the code that branches
      * here; confirm against the full entry.S.
      */
     mrs x3, far_el1         // read FAR_EL1 (fault address) into x3
     inherit_daif    pstate=x23, tmp=x2
     clear_address_tag x0, x3    // x0 <- x3; presumably with the address tag stripped (macro not shown)
     mov x2, sp              // struct pt_regs
     bl  do_mem_abort

     kernel_exit 1
......

el0_da:
    /*
     * Data abort handling
     *
     * Prepares the arguments for do_mem_abort(addr, esr, regs), calls
     * it, then branches to ret_to_user.
     * NOTE(review): x25 is not written in this excerpt - presumably it
     * already holds the ESR value; confirm against the full entry.S.
     */
    mrs x26, far_el1        // read FAR_EL1 (fault address) into x26
    enable_daif
    ct_user_exit
    clear_address_tag x0, x26   // x0 <- x26; presumably with the address tag stripped (macro not shown)
    mov x1, x25             // second argument (esr) taken from x25
    mov x2, sp              // third argument: stack pointer (regs)
    bl  do_mem_abort
    b   ret_to_user


其中el1_da和el0_da中都会调用到do_mem_abort。当CPU运行时发生了data abort异常,就会进入对应的异常模式,并执行向量表中对应的处理函数。

/*
 * C entry point for memory aborts; called from the el1_da and el0_da
 * handlers.
 *
 * addr: fault address
 * esr:  exception syndrome value; selects the fault_info entry to use
 * regs: saved register state
 *
 * Looks up the fault_info entry for esr and runs its handler. If the
 * handler returns 0 the fault is treated as handled and the function
 * returns. Otherwise the fault is reported through arm64_notify_die()
 * using the signal number, si_code and name stored in that entry.
 */
asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
                     struct pt_regs *regs)
{
    const struct fault_info *inf = esr_to_fault_info(esr);
    struct siginfo info;

    /* A zero return from the per-fault handler means it dealt with it. */
    if (!inf->fn(addr, esr, regs))
        return;

    /* Extra diagnostics are printed only when the fault is not from user mode. */
    if (!user_mode(regs)) {
        pr_alert("Unhandled fault at 0x%016lx\n", addr);
        mem_abort_decode(esr);
        show_pte(addr);
    }

    /* Build the siginfo from the table entry and the fault address. */
    clear_siginfo(&info);
    info.si_signo = inf->sig;
    info.si_errno = 0;
    info.si_code  = inf->code;
    info.si_addr  = (void __user *)addr;
    arm64_notify_die(inf->name, regs, &info, esr);
}

其中 esr_to_fault_info 会根据 esr 的低6位(esr & 63)作为下标,在如下的错误处理列表 fault_info 中查找对应的处理项:

/*
 * Map an exception syndrome value to its entry in the fault_info[] table.
 *
 * esr: exception syndrome value; only its low 6 bits (esr & 63) are used,
 *      as the index into fault_info[]
 *
 * Returns a pointer to the selected fault_info entry.
 */
static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
{
    return fault_info + (esr & 63);
}

static const struct fault_info fault_info[] = {
    { do_bad,       SIGKILL, SI_KERNEL, "ttbr address size fault"   },
    { do_bad,       SIGKILL, SI_KERNEL, "level 1 address size fault"    },
    { do_bad,       SIGKILL, SI_KERNEL, "level 2 address size fault"    },
    { do_bad,       SIGKILL, SI_KERNEL, "level 3 address size fault"    },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 0 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 1 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 2 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 3 translation fault" },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 8"         },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 1 access flag fault" },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 2 access flag fault" },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 3 access flag fault" },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 12"            },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 1 permission fault"  },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 2 permission fault"  },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 3 permission fault"  },
    { do_sea,       SIGBUS,  BUS_OBJERR,    "synchronous external abort"    },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 17"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 18"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 19"            },
    { do_sea,       SIGKILL, SI_KERNEL, "level 0 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 1 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 2 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 3 (translation table walk)"  },
    { do_sea,       SIGBUS,  BUS_OBJERR,    "synchronous parity or ECC error" },    // Reserved when RAS is implemented
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 25"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 26"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 27"            },
    { do_sea,       SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 3 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 32"            },
    { do_alignment_fault,   SIGBUS,  BUS_ADRALN,    "alignment fault"       },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 34"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 35"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 36"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 37"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 38"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 39"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 40"            },
......

经过这一系列的调用,最终内核会运行对应的错误处理函数。

发布了234 篇原创文章 · 获赞 78 · 访问量 23万+

猜你喜欢

转载自blog.csdn.net/rikeyone/article/details/103464572