应用调试之配置内核输出应用程序的段错误信息

根据之前的调试我们知道，当应用程序出错的时候，会打印一些信息。但是有时候这些信息是不够的，那么我们能否配置内核使其输出更多的信息呢？答案是肯定的，本节我们就来详细讲一下：

还记得我们在运行firstdrvtest的时候曾经输出过如下一些错误信息：

Unable to handle kernel paging request at virtual address 56000050
pgd = c3c38000
[56000050] *pgd=00000000
Internal error: Oops: 5 [#1]
Modules linked in: first_drv
CPU: 0    Not tainted (2.6.22.6 #19)
PC is at first_drv_open+0x18/0x3c [first_drv]
LR is at chrdev_open+0x14c/0x164
pc : [<bf000018>]    lr : [<c008c888>]    psr: a0000013
sp : c3cede88 ip : c3cede98 fp : c3cede94
r10: 00000000 r9 : c3cec000 r8 : c3cf4700
r7 : 00000000 r6 : 00000000 r5 : c3e240a0 r4 : c06b5300
r3 : bf000000 r2 : 56000050 r1 : bf000964 r0 : 00000000
Flags: NzCv IRQs on FIQs on Mode SVC_32 Segment user
Control: c000717f Table: 33c38000 DAC: 00000015
Process firstdrvtest (pid: 749, stack limit = 0xc3cec258)
Stack: (0xc3cede88 to 0xc3cee000)
de80:                   c3cedebc c3cede98 c008c888 bf000010 00000000 c3cf4700
dea0: c3e240a0 c008c73c c0454da0 c001cf40 c3cedee4 c3cedec0 c0088e48 c008c74c
dec0: c3cf4700 c3cedf04 00000003 ffffff9c c002b044 c3e3d000 c3cedefc c3cedee8
dee0: c0088f64 c0088d58 00000000 00000002 c3cedf68 c3cedf00 c0088fb8 c0088f40
df00: c3cedf04 c001cf40 c0454da0 00000000 00000000 c3c39000 00000101 00000001
df20: 00000000 c3cec000 c046d688 c046d680 ffffffe8 c3e3d000 c3cedf68 c3cedf48
df40: c008916c c009ec70 00000003 00000000 c3cf4700 00000002 bee1de90 c3cedf94
df60: c3cedf6c c00892f4 c0088f88 00008520 bee1de84 0000860c 00008670 00000005
df80: c002b044 4013365c c3cedfa4 c3cedf98 c00893a8 c00892b0 00000000 c3cedfa8
dfa0: c002aea0 c0089394 bee1de84 0000860c 00008720 00000002 bee1de90 00000001
dfc0: bee1de84 0000860c 00008670 00000002 00008520 00000000 4013365c bee1de58
dfe0: 00000000 bee1de34 0000266c 400c98e0 60000010 00008720 00000000 00000000
Backtrace:
[<bf000000>] (first_drv_open+0x0/0x3c [first_drv]) from [<c008c888>] (chrdev_open+0x14c/0x164)
[<c008c73c>] (chrdev_open+0x0/0x164) from [<c0088e48>] (__dentry_open+0x100/0x1e8)
r8:c001cf40 r7:c0454da0 r6:c008c73c r5:c3e240a0 r4:c3cf4700
[<c0088d48>] (__dentry_open+0x0/0x1e8) from [<c0088f64>] (nameidata_to_filp+0x34/0x48)
[<c0088f30>] (nameidata_to_filp+0x0/0x48) from [<c0088fb8>] (do_filp_open+0x40/0x48)
r4:00000002
[<c0088f78>] (do_filp_open+0x0/0x48) from [<c00892f4>] (do_sys_open+0x54/0xe4)
r5:bee1de90 r4:00000002
[<c00892a0>] (do_sys_open+0x0/0xe4) from [<c00893a8>] (sys_open+0x24/0x28)
[<c0089384>] (sys_open+0x0/0x28) from [<c002aea0>] (ret_fast_syscall+0x0/0x2c)
Code: e24cb004 e59f1024 e3a00000 e5912000 (e5923000)
Segmentation fault

这些信息输出的原因我们已经知道了：是因为在内核态直接使用了物理地址！

我们可以在内核的源代码里面的arch/arm目录下搜索"Unable to handle kernel "来找出是在哪里输出了这些错误信息：

grep "Unable to handle kernel " * -nR

输出如下信息：

Binary file boot/Image matches
mm/fault.c:93: "Unable to handle kernel %s at virtual address %08lx\n",
Binary file mm/fault.o matches
Binary file mm/built-in.o matches

我们看到在arch/arm/mm/fault.c文件的93行有这句话！

这句话在函数：__do_kernel_fault中
而__do_kernel_fault在do_bad_area中被调用：

void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->active_mm;

扫描二维码关注公众号，回复： 1376347 查看本文章

/*
* If we are in kernel mode at this point, we
* have no context to handle this fault with.
*/
if (user_mode(regs))  //在用户态出现错误用： __do_user_fault
  __do_user_fault(tsk, addr, fsr, SIGSEGV, SEGV_MAPERR, regs);
else
  __do_kernel_fault(mm, addr, fsr, regs);  //在内核态出现错误用：__do_kernel_fault
}

在用户态发生错误的话会调用__do_user_fault这个函数：

/*
 * Handle a fault raised in user mode (excerpt: only the debug-print part
 * of the function is quoted here).
 *
 * When the kernel is built with CONFIG_DEBUG_USER and the UDBG_SEGV bit is
 * set in user_debug, this logs the task name, signal number, fault address
 * and fsr, then dumps the page-table entry for addr and the registers.
 */
static void
__do_user_fault(struct task_struct *tsk, unsigned long addr,
unsigned int fsr, unsigned int sig, int code,
struct pt_regs *regs)
{
struct siginfo si;

#ifdef CONFIG_DEBUG_USER //1. kernel configuration: CONFIG_DEBUG_USER must be enabled
if (user_debug & UDBG_SEGV) {

//2. U-Boot: set bootargs user_debug=0xff
  printk(KERN_DEBUG "%s: unhandled page fault (%d) at 0x%08lx, code 0x%03x\n",
         tsk->comm, sig, addr, fsr);
  show_pte(tsk->mm, addr);
  show_regs(regs);
}
#endif

因此如果我们需要在用户态发生错误时打印更多信息的话，就需要两项配置：

1、配置内核：使能CONFIG_DEBUG_USER选项（make menuconfig → Kernel hacking → Verbose user fault messages）

2、在u-boot里面设置启动参数：

set bootargs console=ttySAC0 root=/dev/nfs nfsroot=192.168.183.128:/home/share/jz2440/fs_mini_mdev ip=192.168.183.127:192.168.183.128:192.168.183.225:255.255.255.0::eth0:off user_debug=0xff

上面我们配置好了内核以及启动参数，下面我们开始实验一下：

./test_debug

之前只有两条输出信息：

a = 0x12

Segmentation fault

现在出现一大堆输出信息：

a = 0x12
pgd = c3c38000
[00000000] *pgd=33c4a031, *pte=00000000, *ppte=00000000

Pid: 745, comm:           test_debug
CPU: 0    Not tainted (2.6.22.6 #20)
PC is at 0x84ac
LR is at 0x84d0
pc : [<000084ac>]    lr : [<000084d0>]    psr: 60000010
sp : be8c7e30 ip : be8c7e44 fp : be8c7e40
r10: 4013365c r9 : 00000000 r8 : 00008514
r7 : 00000001 r6 : 000085cc r5 : 00008568 r4 : be8c7eb4
r3 : 00000012 r2 : 00000000 r1 : 00001000 r0 : 00000000
Flags: nZCv IRQs on FIQs on Mode USER_32 Segment user
Control: c000717f Table: 33c38000 DAC: 00000015
[<c002bd1c>] (show_regs+0x0/0x4c) from [<c0030a98>] (__do_user_fault+0x5c/0xa4)
r4:c04a8320
[<c0030a3c>] (__do_user_fault+0x0/0xa4) from [<c0030d38>] (do_page_fault+0x1dc/0x20c)
r7:c00251e0 r6:c0020908 r5:c04a8320 r4:ffffffec
[<c0030b5c>] (do_page_fault+0x0/0x20c) from [<c002a224>] (do_DataAbort+0x3c/0xa0)
[<c002a1e8>] (do_DataAbort+0x0/0xa0) from [<c002ae48>] (ret_from_exception+0x0/0x10)
Exception stack(0xc3c7bfb0 to 0xc3c7bff8)
bfa0:                                     00000000 00001000 00000000 00000012
bfc0: be8c7eb4 00008568 000085cc 00000001 00008514 00000000 4013365c be8c7e40
bfe0: be8c7e44 be8c7e30 000084d0 000084ac 60000010 ffffffff
r8:00008514 r7:00000001 r6:000085cc r5:00008568 r4:c038b6c8
Segmentation fault

这样我们就可以分析了：

我们从PC值：0x84ac来分析

（1）反汇编应用程序：arm-linux-objdump -D test_debug > test_debug.dis

（2）在test_debug.dis里面搜索0x84ac

00008490 <C>:
    8490: e1a0c00d mov ip, sp
    8494: e92dd800 stmdb sp!, {fp, ip, lr, pc}
    8498: e24cb004 sub fp, ip, #4 ; 0x4
    849c: e24dd004 sub sp, sp, #4 ; 0x4
    84a0: e50b0010 str r0, [fp, #-16]
    84a4: e51b2010 ldr r2, [fp, #-16]
    84a8: e3a03012 mov r3, #18 ; 0x12
    84ac: e5823000 str r3, [r2]
    84b0: e89da808 ldmia sp, {r3, fp, sp, pc}

我们看到错误出现在C函数里面，并且根据上下文确定了是哪一句出现了错误！

在内核里面发生错误时我们可以通过栈信息来推断函数的调用关系，那么用户态的程序发生错误的话，能不能也把栈信息打印出来呢？答案是肯定的，我们再来看之前的函数：__do_user_fault

__do_user_fault(struct task_struct *tsk, unsigned long addr,
unsigned int fsr, unsigned int sig, int code,
struct pt_regs *regs)
{
struct siginfo si;

#ifdef CONFIG_DEBUG_USER

分析：struct pt_regs *regs这个结构体还记得吧，里面存放了发生错误时各个寄存器的值，我们从它里面的sp开始把栈的内容打印出来不就行了吗！

修改后的函数如下：

/*
 * Modified __do_user_fault() (excerpt: only the debug-print part is quoted).
 *
 * In addition to the stock CONFIG_DEBUG_USER output (fault message, page
 * table entry, registers), this version dumps the user stack starting at
 * the faulting stack pointer so the call chain can be reconstructed by hand.
 */
static void
__do_user_fault(struct task_struct *tsk, unsigned long addr,
		unsigned int fsr, unsigned int sig, int code,
		struct pt_regs *regs)
{
	struct siginfo si;

#ifdef CONFIG_DEBUG_USER
	unsigned long val;
	int i = 0;

	if (user_debug & UDBG_SEGV) {
		printk(KERN_DEBUG "%s: unhandled page fault (%d) at 0x%08lx, code 0x%03x\n",
		       tsk->comm, sig, addr, fsr);
		show_pte(tsk->mm, addr);
		show_regs(regs);

		/*
		 * Dump up to 1024 words of the user stack, 8 words per line,
		 * starting at the user sp saved in regs.  Stop at the first
		 * word that copy_from_user() cannot read.
		 */
		printk("Stack: \n");
		while (i < 1024) {
			if (copy_from_user(&val, (const void __user *)(regs->ARM_sp + i * 4), 4))
				break;
			/* val is unsigned long, so the specifier must be %lx */
			printk("%08lx ", val);
			i++;
			if (i % 8 == 0)
				printk("\n");
		}
		printk("\n END of Stack\n");
	}
#endif

重新编译内核，并且重新启动开发板，运行之前的测试程序得到如下栈的信息：

00000000 be8efe54 be8efe44 000084d0 000084a0 || 00000000 be8efe68 be8efe58
000084f0 000084c4 || 00000000 be8efe88 be8efe6c 00008554 000084e4 || 00000000
00000012 be8efeb4 00000001 00000000 be8efe8c 40034f14 00008524 00000000
00000000 0000839c 00000000 00000000 4001d594 000083c4 000085cc 4000c02c
be8efeb4 be8eff6f 00000000 be8eff7c be8eff86 be8eff8f be8eff96 be8effa1
be8effc4 be8effd2 00000000 00000010 00000003 00000006 00001000 00000011
00000064 00000003 00008034 00000004 00000020 00000005 00000006 00000007
40000000 00000008 00000000 00000009 0000839c 0000000b 00000000 0000000c
00000000 0000000d 00000000 0000000e 00000000 00000017 00000000 0000000f
be8eff6b 00000000 00000000 00000000 00000000 00000000 76000000 2e006c34
7365742f 65645f74 00677562 52455355 6f6f723d 4c4f0074 44575044 48002f3d
3d454d4f 4554002f 763d4d52 32303174 54415000 732f3d48 3a6e6962 7273752f
6962732f 622f3a6e 2f3a6e69 2f727375 006e6962 4c454853 622f3d4c 732f6e69
57500068 6d2f3d44 632f746e 2f65646f 68743832 7070615f 6265645f 2e006775
7365742f 65645f74 00677562 0000000
END of Stack

我们试一下能否根据它来推测出来调用关系：

（1）

00008490 <C>:
8490: e1a0c00d mov ip, sp
8494: e92dd800 stmdb sp!, {fp, ip, lr, pc}  //四个，入栈顺序：pc、lr、ip、fp
8498: e24cb004 sub fp, ip, #4 ; 0x4
849c: e24dd004 sub sp, sp, #4 ; 0x4    //一个，总共5个（4代表一个）
84a0: e50b0010 str r0, [fp, #-16]
84a4: e51b2010 ldr r2, [fp, #-16]
84a8: e3a03012 mov r3, #18 ; 0x12
84ac: e5823000 str r3, [r2]   //出错位置
84b0: e89da808 ldmia sp, {r3, fp, sp, pc}

所以：lr=000084d0

查找：000084d0 ，发现如下信息：
000084b4 <B>:
    84b4: e1a0c00d mov ip, sp
    84b8: e92dd800 stmdb sp!, {fp, ip, lr, pc}//四个
    84bc: e24cb004 sub fp, ip, #4 ; 0x4
    84c0: e24dd004 sub sp, sp, #4 ; 0x4//一个
    84c4: e50b0010 str r0, [fp, #-16]
    84c8: e51b0010 ldr r0, [fp, #-16]
    84cc: ebffffef bl 8490 <C>
    84d0: e89da808 ldmia sp, {r3, fp, sp, pc}

所以：lr=000084f0

搜索：000084f0 ，发现如下信息：

000084d4 <A>:
    84d4: e1a0c00d mov ip, sp
    84d8: e92dd800 stmdb sp!, {fp, ip, lr, pc}//四个
    84dc: e24cb004 sub fp, ip, #4 ; 0x4
    84e0: e24dd004 sub sp, sp, #4 ; 0x4//一个
    84e4: e50b0010 str r0, [fp, #-16]
    84e8: e51b0010 ldr r0, [fp, #-16]
    84ec: ebfffff0 bl 84b4 <B>
    84f0: e89da808 ldmia sp, {r3, fp, sp, pc}

所以：lr=00008554

查找：00008554 ，找到如下信息：

00008514 <main>:
    8514: e1a0c00d mov ip, sp
    8518: e92dd800 stmdb sp!, {fp, ip, lr, pc}//4个
    851c: e24cb004 sub fp, ip, #4 ; 0x4
    8520: e24dd010 sub sp, sp, #16 ; 0x10 //4个
    8524: e50b0010 str r0, [fp, #-16]
    8528: e50b1014 str r1, [fp, #-20]
    852c: e3a03000 mov r3, #0 ; 0x0
    8530: e50b301c str r3, [fp, #-28]
    8534: e24b3018 sub r3, fp, #24 ; 0x18
    8538: e1a00003 mov r0, r3
    853c: ebffffec bl 84f4 <A2>
    8540: e59f001c ldr r0, [pc, #28] ; 8564 <.text+0x1c8>
    8544: e51b1018 ldr r1, [fp, #-24]
    8548: ebffff90 bl 8390 <.text-0xc>
    854c: e51b001c ldr r0, [fp, #-28]
    8550: ebffffdf bl 84d4 <A>
    8554: e3a03000 mov r3, #0 ; 0x0
    8558: e1a00003 mov r0, r3
    855c: e24bd00c sub sp, fp, #12 ; 0xc
    8560: e89da800 ldmia sp, {fp, sp, pc}
    8564: 0000867c andeq r8, r0, ip, ror r6

果然我们找到了调用关系：main->A->B->C

不过为了进一步深究，我们可以来看看是谁调用了main函数：

由main函数知道：

lr=40034f14

不太好分析，算了！直接给答案吧！

main函数是由libc库中的__libc_start_main函数来调用的！

http://liu1227787871.blog.163.com/blog/static/2053631972012613102341318/

NULL Pointer是如何引发OOPS的

要想exploit这种bug，就必须先要了解内核是如何处理空指针引用的。
在程序的执行过程中，因为遇到某种障碍而使CPU无法最终访问到相应的物理内存单元，即无法完成从虚拟地址到物理地址映射的时候，
CPU 会产生一次缺页异常，从而进行相应的缺页异常处理。那么都在什么情况下会引发缺页异常呢，我们分别从用户空间和内核空间来看：

用户空间：
1、进程访问本身地址空间
---> 访问一个无效的内存地址（如mmap后，又unmap的一块内存）。
---> 由于用户堆栈用完导致的越界访问（用户进程堆栈空间已被用完，又有一次函数调用发生，这时push/pusha指令被写到进程的堆中）。
---> 访问一个还未曾映射的空间。
2、进程访问其他进程空间
3、进程通过非系统调用方式访问内核空间。

内核空间：
1、中断程序，不可延迟程序，临界区代码访问用户空间（可能引起休眠）。
2、内核线程访问用户空间。（内核线程不能访问用户空间）。
3、内核访问用户空间（通过系统调用进入内核，有进程的上下文current)
---> 访问当前进程空间。内核写一个只读的内存。
---> 访问其他进程空间。通过系统调用的参数传递到内核空间的，但是线性地址不属于当前进程。
---> 内核bug或硬件错误访问一个用户空间地址。如空指针引用bug。
4、访问内核空间。试图写一个没被映射的内核地址。

缺页异常既可以在用户空间触发，也可以在内核空间触发，当CPU捕获到这个异常的时候就会引发一次缺页异常中断。由do_page_fault()函数来
判断和处理这些异常。我们看下内核是怎么处理引用NULL pointer这个异常的：

/*
 * x86 page-fault entry point, as quoted by the article.
 *
 * regs       - register state at the time of the fault
 * error_code - fault error code; bit 2 (value 4) is tested below to tell
 *              user-mode accesses from kernel-mode ones
 *
 * NOTE(review): this looks like an abridged excerpt of the real handler --
 * vma and write are never used and nothing jumps to the bad_area label, so
 * every path that gets past the early returns ends up in the error handling.
 */
fastcall void __kprobes do_page_fault(struct pt_regs *regs,
				      unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	unsigned long page;
	int write, si_code;

	/* Read the linear address that caused the fault from cr2. */
	address = read_cr2();

	tsk = current;

	si_code = SEGV_MAPERR;

	/* Is the faulting address at or above TASK_SIZE? */
	if (unlikely(address >= TASK_SIZE)) {
		/* Let vmalloc_fault() try to resolve it; done if it succeeds. */
		if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
			return;
		if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
						SIGSEGV) == NOTIFY_STOP)
			return;
		/* Not resolved: continue in the common error path. */
		goto bad_area_nosemaphore;
	}

	/*
	 * From here on the faulting address is below TASK_SIZE.  Both the
	 * kernel and a user program may have referenced such an address.
	 */
	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
		local_irq_enable();

	mm = tsk->mm;

	/* In atomic context, or with no mm: go straight to the error path. */
	if (in_atomic() || !mm)
		goto bad_area_nosemaphore;

	if (!down_read_trylock(&mm->mmap_sem)) {
		/*
		 * Kernel-mode fault (bit 2 of error_code clear) with no
		 * exception-table entry for the faulting eip: do not block
		 * on mmap_sem, take the error path without it.
		 */
		if ((error_code & 4) == 0 &&
		    !search_exception_tables(regs->eip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & 4) {
		/* Unless is_prefetch() says to ignore the fault, signal the task. */
		if (is_prefetch(regs, address, error_code))
			return;

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

	/* The fault was raised by the kernel itself: print the Oops report. */
	if (oops_may_print()) {
		/*
		 * An address below PAGE_SIZE is reported as a NULL pointer
		 * dereference, anything else as a generic paging request.
		 */
		if (address < PAGE_SIZE)
			printk(KERN_ALERT "BUG: unable to handle kernel NULL "
					"pointer dereference");
		else
			printk(KERN_ALERT "BUG: unable to handle kernel paging"
					" request");
		printk(" at virtual address %08lx\n",address);
		printk(KERN_ALERT " printing eip:\n");
		printk("%08lx\n", regs->eip);
	}
	/* Look up the page-directory entry for the faulting address via cr3. */
	page = read_cr3();
	page = ((unsigned long *) __va(page))[address >> 22];
	if (oops_may_print())
		printk(KERN_ALERT "*pde = %08lx\n", page);

	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
}

应用调试之配置内核输出应用程序的段错误信息

猜你喜欢