问题发生的时间点附近出现了ANR,很多进程处于blocked状态。先用 ps 查看处于UN(不可中断睡眠)状态的进程:
crash_arm> ps | grep UN
4 2 0 eea39440 UN 0.0 0 0 [kworker/0:0]
106 2 2 ed9c6540 UN 0.0 0 0 [bm_perf]
172 2 2 ed2357c0 UN 0.0 0 0 [tx_core_thread]
173 2 0 ed235e80 UN 0.0 0 0 [tx_trans_thread]
174 2 2 ed236540 UN 0.0 0 0 [rx_core_thread]
175 2 2 ed236c00 UN 0.0 0 0 [rx_trans_thread]
351 2 2 e8d472c0 UN 0.0 0 0 [kworker/u8:3]
481 2 0 ea8457c0 UN 0.0 0 0 [slp_isr_thread]
5537 6694 1 e1064a40 UN 1.8 1644776 77052 Binder:5517_2
5736 6694 1 e10506c0 UN 1.8 1644776 77052 Binder:5517_3
5759 6694 1 e06c3cc0 UN 1.5 1633860 62696 Binder:5745_1
5865 6694 1 d2a30000 UN 2.6 1682976 109576 Binder:5843_1
5869 6694 1 e188a880 UN 2.6 1682976 109576 Binder:5843_2
5905 6694 1 d2a31440 UN 1.6 1635080 64724 Binder:5892_2
6095 6694 1 e123ec00 UN 2.5 1685336 103292 Binder:6074_1
6255 6694 1 d221a880 UN 2.6 1682976 109576 Binder:5843_3
6317 6694 1 d2d606c0 UN 1.9 1657900 79304 Binder:6303_1
6593 4345 1 e2ee8d80 UN 2.5 1671636 103132 Binder:6581_1
6825 6694 1 de69a880 UN 2.5 1685336 103292 Binder:6074_1
6826 6694 1 de699b00 UN 2.6 1682976 109576 Binder:5843_2
6827 4345 1 e1062f40 UN 2.5 1671636 103132 Binder:6581_2
UN状态的Binder线程有点多。逐个看它们的调用栈,确认各自卡在什么地方:
crash_arm> bt d2a30000
PID: 5865 TASK: d2a30000 CPU: 1 COMMAND: "Binder:5843_1"
#0 [<c0a27ef4>] (__schedule) from [<c0a2846c>]
#1 [<c0a2846c>] (schedule) from [<c0a2867c>]
#2 [<c0a2867c>] (schedule_preempt_disabled) from [<c0a29e08>]
#3 [<c0a29e08>] (__mutex_lock_slowpath) from [<c0a29eac>]
#4 [<c0a29eac>] (mutex_lock) from [<c0267ae8>]
#5 [<c0267ae8>] (__fdget_pos) from [<c024c248>]
#6 [<c024c248>] (sys_write) from [<c0108280>]
这个task是在等 mutex_lock(&file->f_pos_lock);
通过这个task的栈来看,这个mutex的地址其实是会被压栈的,所以能从栈里推出这个mutex的地址,进而查看它是什么状态。
crash_arm> mutex e3339628
struct mutex {
count = {
counter = 0
},
wait_lock = {
{
rlock = {
raw_lock = {
{
slock = 196611,
tickets = {
owner = 3,
next = 3
}
}
}
}
}
},
wait_list = {
next = 0xe2c23f14,
prev = 0xebbf9f14
},
owner = 0xe2ee8d80,
osq = {
tail = {
counter = 0
}
}
}
//mutex 的owner,即指向持锁的task_struct
crash_arm> task_struct.comm 0xe2ee8d80
comm = "Binder:6581_1\000e"
crash_arm> bt d2a31440
PID: 5905 TASK: d2a31440 CPU: 1 COMMAND: "Binder:5892_2"
#0 [<c0a27ef4>] (__schedule) from [<c0a2846c>]
#1 [<c0a2846c>] (schedule) from [<c0a2867c>]
#2 [<c0a2867c>] (schedule_preempt_disabled) from [<c0a29e08>]
#3 [<c0a29e08>] (__mutex_lock_slowpath) from [<c0a29eac>]
#4 [<c0a29eac>] (mutex_lock) from [<c01b0a74>]
#5 [<c01b0a74>] (cgroup_kn_lock_live) from [<c01b3ed4>]
#6 [<c01b3ed4>] (__cgroup_procs_write) from [<c01b4188>]
#7 [<c01b4188>] (cgroup_tasks_write) from [<c01afff0>]
#8 [<c01afff0>] (cgroup_file_write) from [<c02b6bb8>]
#9 [<c02b6bb8>] (kernfs_fop_write) from [<c024b17c>]
#10 [<c024b17c>] (__vfs_write) from [<c024ba1c>]
#11 [<c024ba1c>] (vfs_write) from [<c024c270>]
#12 [<c024c270>] (sys_write) from [<c0108280>]
这个task是在等 mutex_lock(&cgroup_mutex); 正好是全局变量,直接看
crash_arm> cgroup_mutex
cgroup_mutex = $1 = {
count = {
counter = -1
},
wait_lock = {
{
rlock = {
raw_lock = {
{
slock = 265490387,
tickets = {
owner = 4051,
next = 4051
}
}
}
}
}
},
wait_list = {
next = 0xe1b57d84,
prev = 0xd36a5d84
},
owner = 0xe2ee8d80,
osq = {
tail = {
counter = 0
}
}
}
//mutex 的owner,即指向持锁的task_struct
crash_arm> task_struct.comm 0xe2ee8d80
comm = "Binder:6581_1\000e"
//上面两类task等待的锁都是被 Binder:6581_1 这个task持有了(comm 输出里的 \000 是字符串结束符,后面的 e 只是comm缓冲区里的残留字节,不属于任务名),那么这个 Binder:6581_1 是啥情况呢?
crash_arm> bt e2ee8d80
PID: 6593 TASK: e2ee8d80 CPU: 1 COMMAND: "Binder:6581_1"
#0 [<c0a27ef4>] (__schedule) from [<c0a2846c>]
#1 [<c0a2846c>] (schedule) from [<c0a2acc8>]
#2 [<c0a2acc8>] (rwsem_down_write_failed) from [<c0a2a23c>]
#3 [<c0a2a23c>] (down_write) from [<c0174cd8>]
#4 [<c0174cd8>] (percpu_down_write) from [<c01b3ed4>]
#5 [<c01b3ed4>] (__cgroup_procs_write) from [<c01b4188>]
#6 [<c01b4188>] (cgroup_tasks_write) from [<c01afff0>]
#7 [<c01afff0>] (cgroup_file_write) from [<c02b6bb8>]
#8 [<c02b6bb8>] (kernfs_fop_write) from [<c024b17c>]
#9 [<c024b17c>] (__vfs_write) from [<c024ba1c>]
#10 [<c024ba1c>] (vfs_write) from [<c024c270>]
#11 [<c024c270>] (sys_write) from [<c0108280>]
这个task是在 percpu_down_write(&cgroup_threadgroup_rwsem) 里通过 down_write() 获取内部的读写信号量 rw_sem 时失败,然后睡下去了。
Block until there are no active lockers.
这里正好是全局变量,直接看其成员
crash_arm> cgroup_threadgroup_rwsem
rw_sem = {
count = -65536,
wait_list = {
next = 0xe0315d70,
prev = 0xe0315d70
},
wait_lock = {
raw_lock = {
{
slock = 26542485,
tickets = {
owner = 405,
next = 405
}
}
}
},
osq = {
tail = {
counter = 0
}
},
owner = 0x0
},
这个count 从内存里看
crash_arm> sym cgroup_threadgroup_rwsem
c123c8e0 (B) cgroup_threadgroup_rwsem
crash_arm> percpu_rw_semaphore -ox
struct percpu_rw_semaphore {
[0x0] struct rcu_sync rss;
[0x24] unsigned int *read_count;
[0x28] struct rw_semaphore rw_sem;
[0x40] wait_queue_head_t writer;
[0x4c] int readers_block;
}
SIZE: 0x50
crash_arm> rd C123C908 30
c123c908: ffff0000 e0315d70 e0315d70 01950195 ....p]1.p]1.....
c123c918: 00000000 00000000 326e326e c123c924 ........n2n2$.#.
c123c928: c123c924 00000000 0a850a85 ee803a00 $.#..........:..
c123c938: ec106a80 00000000 00000001 0000000f .j..............
c123c948: 00000001 0000000f 00000001 0000000f ................
count 即是 0xffff0000(与上面结构体里打印的 -65536 一致),owner 为空。
kernel/locking/rwsem-xadd.c代码中有如下一段关于count值含义的比较全面的介绍。
·0x0000_0000:为初始化值,表示没有读者和写者。
·0x0000_000X:表示有X个活跃的读者或者正在申请的读者,没有写者干扰。
·0xffff_000X:可能是有X个活跃读者,还有写者正在睡眠等待;或者是有一个写者持有锁,还有多个读者正在睡眠等待。
·0xffff_0001:表示当前只有一个活跃的写者;或者一个活跃或者申请中的读者,还有写者正在睡眠等待。
·0xffff_0000:表示WAITING_BIAS,有读者或者写者正在睡眠等待,但是它们都还没成功获取锁。
RWSEM_WAITING_BIAS通常用于表示等待队列中还有其他正在排队的人。持有锁和释放锁时对count的操作是成对出现的,当判断count值等于RWSEM_WAITING_BIAS时,表示当前已经没有活跃的锁,即没有人持有锁,但有人在等待队列中。
(注:下面两段里的“第16行”“第22行”是所引用资料中代码清单的行号,该代码本文没有贴出。)假设等待队列为空,那么当前进程就是该等待队列上第一个客户,这里count值要加上RWSEM_WAITING_BIAS(−65536或者二元数[−1, 0]),adjustment值等于−65537,第16行代码执行完毕后,count值将变成−131071(sem->count+adjustment, −65534−65537)。
假设在第16行代码之后,持有写者锁的进程释放了锁,那么sem->count的值会变成多少呢?sem->count − RWSEM_ACTIVE_WRITE_BIAS = −65536,第22行代码的判断语句(count == RWSEM_WAITING_BIAS)恰巧可以捕捉到这个变化,调用__rwsem_do_wake()函数去唤醒在等待队列中睡眠的人们。
再看一下 wait list:wait_list 的 next 和 prev 都是 0xe0315d70,说明等待队列上确实有且只有一个等待者;但按 rwsem_waiter 解析出来的内容(task、type)明显不合理,结构体似乎已经被破坏,分析卡在这里了。
crash_arm> list rwsem_waiter.list -s rwsem_waiter.task,type 0xe0315d70
e0315d70
task = 0xc014a36c <atomic_notifier_call_chain+40>
type = (unknown: 3222577908)
e0315d94
task = 0xc0a28464 <schedule+172>
type = (unknown: 3731841120)
c014a350
task = 0xe8bd4000
type = (unknown: 3818926080)
e24dd008
task = 0x0
type = RWSEM_WAITING_FOR_WRITE