I. Introduction to crash:
A ramdump is the raw material for analyzing a kernel crash; the crash tool lets you debug a ramdump directly, much like debugging a coredump with gdb.
crash is built on top of gdb and extends it with kernel awareness, so it is used the same way as gdb, just with extra kernel-specific commands.
Official crash site: http://people.redhat.com/anderson/
How do we build crash and then use it to debug a ramdump?
1. Download the crash source code from the official site.
2. Before building, make sure the required components (ncurses and zlib) are installed; if not:
sudo apt-get install libncurses5-dev
sudo apt-get install zlib1g-dev
3. Unpack the source and build crash for 32-bit ARM:
cd crash-7.1.0
make target=ARM
4. For 64-bit ARM, instead:
cd crash-7.1.0
make target=ARM64
5. The build produces a crash binary in the current directory; the extra symbols can be stripped:
strip -s crash
6. When a kernel crash occurs, a db file is generated. Unpack the db with GAT's logviewer to obtain SYS_COREDUMP; together with the matching vmlinux (it must be the vmlinux backed up before flashing!) you can then debug with crash:
crash vmlinux SYS_COREDUMP
7. Commonly used crash commands:
ps
struct    struct vm_area_struct c1e44f10
list      list task_struct.p_pptr c169a000
task      task -x
search    search -u deadbeef
bt        bt <pid>
p
dis       dis sys_signal
rd        rd -a linux_banner / rd [start_addr] -e [end_addr]
wr
II. Problem background:
An MTK platform hit a hang_detect reboot. Looking at the watchdog thread and the other tasks in D (uninterruptible) state:
watchdog D 1309.915790 1165 375 1286 0x400140 0x80 969 5002309695283
<c0ef26e8> __schedule+0x488/0x10d4
<c0ef3388> schedule+0x54/0xbc
<c0ef8400> rwsem_down_read_failed+0xd4/0x124
<c0ef7454> down_read+0x78/0x7c
<c0318bdc> iterate_dir+0x48/0x198
<c03194fc> SyS_getdents64+0x90/0x158
asiainno.uplive D 912.053054 13523 1257 14 0x400140 0x80 13523 5373236247475
<c0ef26e8> __schedule+0x488/0x10d4
<c0ef3388> schedule+0x54/0xbc
<c0ef86e8> schedule_timeout+0x200/0x5bc
<c0ef2228> io_schedule_timeout+0x80/0xb8
<c046e93c> f2fs_wait_on_all_pages_writeback+0x68/0xbc
<c046f018> do_checkpoint+0x688/0xd58
<c046fe38> f2fs_write_checkpoint+0x750/0xb58
<c046149c> f2fs_sync_fs+0x10c/0x2bc
<c0449fc0> f2fs_do_sync_file+0x1f8/0xda4
<c044abb0> f2fs_sync_file+0x44/0x58
<c0341478> SyS_fdatasync+0x54/0xac
The main question is why f2fs_wait_on_all_pages_writeback keeps ending up in schedule_timeout; below we use crash to walk through the whole chain:
1. First, the code of f2fs_wait_on_all_pages_writeback:
void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);

		if (!get_pages(sbi, F2FS_WB_CP_DATA))
			break;

		if (unlikely(f2fs_cp_error(sbi)))
			break;

		io_schedule_timeout(5*HZ);
	}
	finish_wait(&sbi->cp_wait, &wait);
}
Our suspicion is that get_pages(sbi, F2FS_WB_CP_DATA) stays greater than 0, which is what keeps triggering io_schedule_timeout:
static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
{
	return atomic_read(&sbi->nr_pages[count_type]);
}
This brings us to the crux of the problem: does sbi->nr_pages[count_type] really stay above 0? We use crash to find out.
III. Working through the problem with crash:
1. Load crash:
~/problem/crash$ ../shell/crash vmlinux SYS_COREDUMP
This reported the following errors:
crash: read error: kernel virtual address: c0f06474 type: "kernel_config_data"
WARNING: cannot read kernel_config_data
crash: read error: kernel virtual address: c1614830 type: "possible"
WARNING: cannot read cpu_possible_map
crash: read error: kernel virtual address: c1614834 type: "present"
WARNING: cannot read cpu_present_map
crash: read error: kernel virtual address: c161482c type: "online"
WARNING: cannot read cpu_online_map
crash: read error: kernel virtual address: c161483c type: "active"
WARNING: cannot read cpu_active_map
crash: read error: kernel virtual address: c218dd20 type: "shadow_timekeeper xtime_sec"
crash: read error: kernel virtual address: c161c560 type: "init_uts_ns"
crash: vmlinux and SYS_COREDUMP do not match!
Some research showed that the DDR start address must be specified; it can be read from the device tree's memory node:
memory {
device_type = "memory";
reg = <0 0x40000000 0 0x20000000>;
};
../shell/crash -m phys_base=0x40000000 vmlinux SYS_COREDUMP
With that, crash loads successfully.
2. Look at the task's call stack with crash:
crash> bt 13523    // the PID is taken from the stack trace above
PID: 13523 TASK: d6cec000 CPU: 1 COMMAND: "asiainno.uplive"
#0 [<c0ef2264>] (__schedule) from [<c0ef3388>]
#1 [<c0ef3338>] (schedule) from [<c0ef86e8>]
#2 [<c0ef84ec>] (schedule_timeout) from [<c0ef2228>]
#3 [<c0ef21ac>] (io_schedule_timeout) from [<c046e93c>]
#4 [<c046e8d8>] (f2fs_wait_on_all_pages_writeback) from [<c046f018>]
#5 [<c046e994>] (do_checkpoint) from [<c046fe38>]
#6 [<c046f6ec>] (f2fs_write_checkpoint) from [<c046149c>]
#7 [<c0461394>] (f2fs_sync_fs) from [<c0449fc0>]
#8 [<c0449dcc>] (f2fs_do_sync_file) from [<c044abb0>]
#9 [<c044ab70>] (f2fs_sync_file) from [<c0341478>]
#10 [<c0341428>] (sys_fdatasync) from [<c0109aec>]
Here TASK: d6cec000 points to the task's task_struct, whose contents crash can print (state = 2 is TASK_UNINTERRUPTIBLE, matching the D state above):
crash> struct task_struct 0xd6cec000
struct task_struct {
state = 2,
stack = 0xd90ba000,
usage = {
counter = 4
},
......
}
3. From stack = 0xd90ba000 we can estimate the SP, based on the following:
The stack grows downward, from high addresses to low, and on a 32-bit system each task's kernel stack is 8KB (16KB on 64-bit). Knowing the stack base, the formula SP ≈ stack_base + 8KB - current stack usage lets us dump that stretch of memory and recover sbi from it.
4. That requires computing the current stack usage, done as follows:
crash> disas SyS_fdatasync
0xc0341424 <+0>:  mov r12, sp
0xc0341428 <+4>:  push {r4, r5, r6, r7, r11, r12, lr, pc}  //32
0xc0341430 <+12>: sub sp, sp, #16
crash> disas f2fs_sync_file
0xc044ab70 <+4>:  push {r4, r5, r11, r12, lr, pc}  //24
0xc044ab78 <+12>: sub sp, sp, #16
crash> disas f2fs_do_sync_file
0xc0449dcc <+4>:  push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr, pc}  //44
0xc0449dd4 <+12>: sub sp, sp, #108 ; 0x6c
crash> disas f2fs_sync_fs
0xc0461394 <+4>:  push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr, pc}  //44
0xc046139c <+12>: sub sp, sp, #52 ; 0x34
crash> disas f2fs_write_checkpoint
0xc046f6ec <+4>:  push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr, pc}  //44
0xc046f6f0 <+8>:  sub r11, r12, #4
crash> disas do_checkpoint
0xc046e994 <+4>:  push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr, pc}  //44
0xc046e99c <+12>: sub sp, sp, #92 ; 0x5c
0xc046e9b8 <+40>: sub sp, sp, #192 ; 0xc0
crash> disas f2fs_wait_on_all_pages_writeback
0xc046e8d8 <+4>:  push {r4, r5, r6, r11, r12, lr, pc}  //28
0xc046e8e0 <+12>: sub sp, sp, #28
Each pushed register occupies 4 bytes, and the sub sp, sp, #N in each prologue reserves N bytes for local variables. Adding the registers and locals across the frames gives the stack space currently in use, here 1048 bytes (rounded up to 1200 bytes for the calculation):
0xd90ba000 + 0x2000 (8KB) - 1200 = 0xD90BBB50, from which we can dump the task's stack memory:
crash> rd 0xD90BBC00 -e 0xD90BC000
d90bbc00: 000005dc 003d47a0 d90bbe60 c1614d14 .....G=.`....Ma.
d90bbc10: d90bbc3c d90bbc20 c0ef2228 c0ef84f4 <... ...("......
d90bbc20: db1a5000 db1a5560 c1614808 00000205 .P..`U...Ha.....
d90bbc30: d90bbc74 d90bbc40 c046e93c c0ef21b4 t...@...<.F..!..
//f2fs_wait_on_all_pages_writeback start
d90bbc40: 00000000 d6cec000 c01a9b8c db1a5598 .............U..
d90bbc50: db1a5598 00040975 00000205 db1a5000//r4 .U..u........P..
d90bbc60: dadde000//r5 c161494c//r6 d90bbdbc//r11 d90bbc78//r12 ....LIa.....x...
d90bbc70: c046f018//lr c046e8e0//pc c01c83f4 c01e5154 ..F...F.....TQ..
//f2fs_wait_on_all_pages_writeback end
// The corresponding stack layout can be derived from the disassembly below; other functions follow the same pattern
crash> disas f2fs_wait_on_all_pages_writeback
Dump of assembler code for function f2fs_wait_on_all_pages_writeback:
0xc046e8d4 <+0>: mov r12, sp
0xc046e8d8 <+4>: push {r4, r5, r6, r11, r12, lr, pc}
0xc046e8dc <+8>: sub r11, r12, #4
0xc046e8e0 <+12>: sub sp, sp, #28
// Empirically, the saved PC of f2fs_wait_on_all_pages_writeback = 0xc046e8e0,
// i.e. two instructions below the push
0xc046e8e4 <+16>: mov r3, sp
0xc046e8e8 <+20>: bic r3, r3, #8128 ; 0x1fc0
This tells us r4 = 0xdb1a5000. Now look at the caller's disassembly:
crash> disas do_checkpoint
Dump of assembler code for function do_checkpoint:
0xc046e990 <+0>: mov r12, sp
0xc046e994 <+4>: push {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr, pc}
0xc046e998 <+8>: sub r11, r12, #4
0xc046e99c <+12>: sub sp, sp, #92 ; 0x5c
.......
0xc046f21c <+2188>: mov r0, r4 // r4 here corresponds to sbi
0xc046f220 <+2192>: bl 0xc0496bb8 <f2fs_write_node_summaries>
The above derived the stack contents by hand using the frame-layout rules; crash can also print the same information directly:
crash> bt -f 13523
PID: 13523 TASK: d6cec000 CPU: 1 COMMAND: "asiainno.uplive"
#0 [<c0ef2264>] (__schedule) from [<c0ef3388>]
[PC: c0ef2264 LR: c0ef3388 SP: d90bbb20 FP: d90bbba4 SIZE: 112]
d90bbb20: 3746cf21 000004ea 00000000 00000000
d90bbb30: d90bbb74 d90bbb40 c0209af4 c0b9afd4
d90bbb40: c0ef3388 d6cec430 34e4e7dc 000004ea
d90bbb50: df67dae8 df67e020 34e55455 000004ea
d90bbb60: 400d0013 d90ba000 c1614808 c1606100
d90bbb70: d90bbbc4 df677480 00000001 c17c6ff4
d90bbb80: d90bbba4 d90bbb90 c0ef3388 c0ef226c
#1 [<c0ef3338>] (schedule) from [<c0ef86e8>]
[PC: c0ef3338 LR: c0ef86e8 SP: d90bbb90 FP: d90bbc1c SIZE: 24]
d90bbb90: 00000080 0017631c d90bbc1c d90bbba8
d90bbba0: c0ef86e8 c0ef3340
#2 [<c0ef84ec>] (schedule_timeout) from [<c0ef2228>]
[PC: c0ef84ec LR: c0ef2228 SP: d90bbba8 FP: d90bbc3c SIZE: 120]
d90bbba8: 722fbb86 000004c8 db1a5000 ffffffff
d90bbbb8: df677480 00000000 400d0013 00000000
d90bbbc8: df677748 0017631c c01f1e7c d6cec000
d90bbbd8: 23400001 c218cf4c 00000000 00000000
d90bbbe8: c1251e34 00040975 c0ef989c df67dac0
d90bbbf8: df67e030 00000000 000005dc 003d47a0
d90bbc08: d90bbe60 c1614d14 d90bbc3c d90bbc20
d90bbc18: c0ef2228 c0ef84f4
#3 [<c0ef21ac>] (io_schedule_timeout) from [<c046e93c>]
[PC: c0ef21ac LR: c046e93c SP: d90bbc20 FP: d90bbc74 SIZE: 32]
d90bbc20: db1a5000 db1a5560 c1614808 00000205
d90bbc30: d90bbc74 d90bbc40 c046e93c c0ef21b4
#4 [<c046e8d8>] (f2fs_wait_on_all_pages_writeback) from [<c046f018>]
[PC: c046e8d8 LR: c046f018 SP: d90bbc40 FP: d90bbdbc SIZE: 56]
d90bbc40: 00000000 d6cec000 c01a9b8c db1a5598
d90bbc50: db1a5598 00040975 00000205 db1a5000
d90bbc60: dadde000 c161494c d90bbdbc d90bbc78
d90bbc70: c046f018//lr c046e8e0//pc
#5 [<c046e994>] (do_checkpoint) from [<c046fe38>]
[PC: c046e994 LR: c046fe38 SP: d90bbc78 FP: d90bbe54 SIZE: 328]
d90bbc78: c01c83f4 c01e5154 db133d80 00000000
d90bbc88: d90bbcdc d90bbc98 c02e96f4 00000000
d90bbc98: db3c4b70 60000113 c1f8a0e0 c0ef6f48
d90bbca8: d90bbcc4 60000193 db3c4b68 d90bbce4
d90bbcb8: 00000080 db3c4b68 4c23cc10 c1614808
5. The analysis has finally produced sbi's address, so we can now print its contents at will:
crash> struct f2fs_sb_info.nr_pages 0xdb1a5000
nr_pages = {{
counter = 0
}, {
counter = 2224
}, {
counter = 0
}, {
counter = 36
}, {
counter = 0
}, {
counter = 0
}, {
counter = 1
}, {
counter = -76
}, {
counter = 76
}, {
counter = 0
}, {
counter = 0
}, {
counter = 0
}},
Checking F2FS_WB_CP_DATA against the enum below shows its counter = 76, stuck above 0, which is what keeps causing io_schedule_timeout:
enum count_type {
	F2FS_DIRTY_DENTS,
	F2FS_DIRTY_DATA,
	F2FS_DIRTY_QDATA,
	F2FS_DIRTY_NODES,
	F2FS_DIRTY_META,
	F2FS_INMEM_PAGES,
	F2FS_DIRTY_IMETA,
	F2FS_WB_CP_DATA,
	F2FS_WB_DATA,
	F2FS_RD_DATA,
	F2FS_RD_NODE,
	F2FS_RD_META,
	NR_COUNT_TYPE,
};
At this point the problem is clear. In the end it is fixed by a patch from the f2fs community, but getting there took quite a bit of background knowledge.
Author: frank_zyp
Your support is the greatest encouragement to the author; thank you for reading carefully.
This article carries no copyright claims; feel free to repost.