trace系列4 - kretprobe学习笔记

1.前言

本文主要是根据阅码场《Linux内核tracers的实现原理与应用》视频课程在aarch64上的实践。通过观察钩子函数的创建过程以及替换过程,理解trace的原理。本文同样以blk_update_request函数为例说明kretprobe的工作原理,此处的kretprobe基于trace event实现,同时使用了ftrace的框架。

kernel版本:5.10
平台:arm64

2. kretprobe领域模型

在这里插入图片描述
trace系列4 - kretprobe学习笔记
kretprobe_instance : 记录了原始的返回地址,以及所属的kretprobe,作为kretprobe实例连入kretprobe的free_instances链表;当kretprobe_instance被初始化后,它将从free_instances链表移除,重新连入全局kretprobe_inst_table哈希链表

3. kretprobe创建

在执行如下指令时,会完成kretprobe的创建:

#echo 'r:blk_update blk_update_request $retval' > /sys/kernel/debug/tracing/kprobe_events

此过程主要通过调用create_or_delete_trace_kprobe完成,其中最主要的是设置pre_handler为pre_handler_kretprobe,同时设置了打印格式,并完成trace_kprobe的注册。与kprobe创建时的主要区别在于:rp->kp.pre_handler的初始化和kretprobe.handler的初始化

|- -rp->kp.pre_handler初始化

create_or_delete_trace_kprobe -> 
    trace_kprobe_create ->
        register_trace_kprobe -> 
            __register_trace_kprobe

会调用register_kretprobe,它初始化了pre_handler为pre_handler_kretprobe

/*
 * register_kretprobe - register a return probe described by @rp.
 *
 * Validates the probe location, installs pre_handler_kretprobe as the
 * entry-side pre_handler of the embedded kprobe, pre-allocates
 * rp->maxactive kretprobe_instance objects onto rp->free_instances and
 * finally registers the embedded kprobe.
 *
 * Returns the result of register_kprobe() on the normal path, -EINVAL if
 * the location is not a function entry or is blacklisted, PTR_ERR(addr)
 * if the probe address cannot be resolved, or -ENOMEM if an instance
 * allocation fails.
 */
int register_kretprobe(struct kretprobe *rp)
{
    
    
        int ret = 0;
        struct kretprobe_instance *inst;
        int i;
        void *addr;

        /* Return probes are only accepted on a function entry. */
        if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
                return -EINVAL;

        /* Reject any address listed in kretprobe_blacklist. */
        if (kretprobe_blacklist_size) {
    
    
                addr = kprobe_addr(&rp->kp);
                if (IS_ERR(addr))
                        return PTR_ERR(addr);

                for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
    
    
                        if (kretprobe_blacklist[i].addr == addr)
                                return -EINVAL;
                }
        }
        // Install the entry-side pre_handler callback
        rp->kp.pre_handler = pre_handler_kretprobe;
        rp->kp.post_handler = NULL;
        rp->kp.fault_handler = NULL;

        /* Pre-allocate memory for max kretprobe instances */
        if (rp->maxactive <= 0) {
    
    
#ifdef CONFIG_PREEMPTION
                // Evaluates to 10 in this walkthrough
                rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#else
                rp->maxactive = num_possible_cpus();
#endif
        }
        raw_spin_lock_init(&rp->lock);
        INIT_HLIST_HEAD(&rp->free_instances);
        // rp->maxactive is 10 in this example, so the loop allocates 10
        // kretprobe_instance objects and links each one onto rp->free_instances.
        // This shows that a single kretprobe owns multiple kretprobe_instance objects.
        for (i = 0; i < rp->maxactive; i++) {
    
    
                /* Each instance carries rp->data_size extra bytes after the struct. */
                inst = kmalloc(sizeof(struct kretprobe_instance) +
                               rp->data_size, GFP_KERNEL);
                if (inst == NULL) {
    
    
                        /* Release the instances allocated so far. */
                        free_rp_inst(rp);
                        return -ENOMEM;
                }    
                INIT_HLIST_NODE(&inst->hlist);
                hlist_add_head(&inst->hlist, &rp->free_instances);
        }

        rp->nmissed = 0;
        /* Establish function entry probe point */
        ret = register_kprobe(&rp->kp);
        if (ret != 0)
                free_rp_inst(rp);
        return ret;
}

|- -kretprobe.handler初始化

create_or_delete_trace_kprobe -> 
    trace_kprobe_create ->
       alloc_trace_kprobe
  • alloc_trace_kprobe:为trace_kprobe分配空间,主要初始化了kretprobe的handler为kretprobe_dispatcher

4. kretprobe brk指令替换

先来看下未替换指令前blk_update_request的反汇编:

Dump of assembler code for function blk_update_request:
   0xffff8000104ec1f0 <+0>:     sub     sp, sp, #0x60
   0xffff8000104ec1f4 <+4>:     stp     x29, x30, [sp,#16]
   0xffff8000104ec1f8 <+8>:     add     x29, sp, #0x10
   0xffff8000104ec1fc <+12>:    stp     x19, x20, [sp,#32]
   0xffff8000104ec200 <+16>:    stp     x21, x22, [sp,#48]
   0xffff8000104ec204 <+20>:    stp     x23, x24, [sp,#64]
   0xffff8000104ec208 <+24>:    str     x25, [sp,#80]
   0xffff8000104ec20c <+28>:    mov     x22, x0
   0xffff8000104ec210 <+32>:    uxtb    w24, w1
   0xffff8000104ec214 <+36>:    mov     w21, w2
   0xffff8000104ec218 <+40>:    mov     x0, x30
   0xffff8000104ec21c <+44>:    nop
   ......

在执行如下命令后

# echo 1 >/sys/kernel/debug/tracing/events/kprobes/blk_update/enable 

我们可以看到,在执行如上操作后,blk_update_request的入口处的指令

sub     sp, sp, #0x60

被替换为:

0xffff8000104ec1f0 <+0>:     brk     #0x4

可以看到替换后的断点指令与kprobe是一致的。这是因为使能kretprobe时主要调用了如下函数,它直接调用enable_kprobe,因此与使能kprobe的过程是一致的

/*
 * enable_kretprobe - enable a return probe.
 *
 * Simply enables the kprobe embedded in @rp and returns enable_kprobe()'s
 * result, which is why the instruction patched at the function entry is
 * the same as for a plain kprobe.
 */
static inline int enable_kretprobe(struct kretprobe *rp)
{
    
    
        return enable_kprobe(&rp->kp);
}

5. kretprobe钩子函数的执行

与前述kprobe的执行路径相同,当触发kretprobe执行时会按如下的执行路径,区别是执行的pre_handler不同:

#0  kprobe_handler (regs=0xffff80001253bcf0) at arch/arm64/kernel/probes/kprobes.c:352
#1  kprobe_breakpoint_handler (regs=0xffff80001253bcf0, esr=<optimized out>) at arch/arm64/kernel/probes/kprobes.c:404
#2  0xffff8000100148c4 in call_break_hook (regs=regs@entry=0xffff80001253bcf0, esr=esr@entry=4060086276) at arch/arm64/kernel/debug-monitors.c:322
#3  0xffff800010014a00 in brk_handler (unused=<optimized out>, esr=4060086276, regs=0xffff80001253bcf0) at arch/arm64/kernel/debug-monitors.c:329
#4  0xffff800010036180 in do_debug_exception (addr_if_watchpoint=addr_if_watchpoint@entry=5651652, esr=esr@entry=4060086276, regs=regs@entry=0xffff80001253bcf0) at arch/arm64/mm/fault.c:848
#5  0xffff800010cad220 in el1_dbg (regs=0xffff80001253bcf0, esr=4060086276) at arch/arm64/kernel/entry-common.c:190
#6  0xffff800010cad468 in el1_sync_handler (regs=<optimized out>) at arch/arm64/kernel/entry-common.c:227
#7  0xffff8000100119bc in el1_sync () at arch/arm64/kernel/entry.S:627

|- -pre_handler_kretprobe

对于kretprobe则会执行pre_handler_kretprobe回调:

int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
    |--struct kretprobe_instance *ri = NULL, *last = NULL;
    |--struct kretprobe *rp = container_of(p, struct kretprobe, kp);
    |--hash = hash_ptr(current, KPROBE_HASH_BITS);
    |--if (!hlist_empty(&rp->free_instances))
           // 从kretprobe->free_instances的实例链表中,找到空闲的kretprobe_instance实例
           ri = hlist_entry(rp->free_instances.first,struct kretprobe_instance, hlist);
           //从kretprobe->free_instances的实例链表中,删除此实例
           hlist_del(&ri->hlist);
           //初始化找到的空闲kretprobe_instance实例
           ri->rp = rp;
           ri->task = current;
           arch_prepare_kretprobe(ri, regs);
           INIT_HLIST_NODE(&ri->hlist);
           //将初始化的kretprobe_instance实例连入全局kretprobe_inst_table哈希链表
           hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);

pre_handler_kretprobe在执行过程中会调用arch_prepare_kretprobe,它主要保存了原始返回地址用于恢复,并设置了临时返回地址为kretprobe_trampoline用于执行kretprobe的功能

arch_prepare_kretprobe(struct kretprobe_instance *ri,struct pt_regs *regs)
    |  // 初始化kretprobe_instance实例为原始的返回地址,即blk_mq_end_request的返回地址,
    |  // 当从kretprobe_trampoline返回时用于恢复原有执行路径
    |--ri->ret_addr = (kprobe_opcode_t *)regs->regs[30];
    |  //初始化栈帧
    |--ri->fp = (void *)kernel_stack_pointer(regs);
    |  /* replace return addr (x30) with trampoline */
    |  //更新了返回地址,这样在从blk_update_request返回时会执行kretprobe_trampoline函数
    |--regs->regs[30] = (long)&kretprobe_trampoline;

|- -setup_singlestep

setup_singlestep(p, regs, kcb, 0)
    |--unsigned long slot;
    |--kcb->kprobe_status = KPROBE_HIT_SS;
    |--if (p->ainsn.api.insn)
           //slot存放了blk_update_request的入口指令:sub     sp, sp, #0x60
           slot = (unsigned long)p->ainsn.api.insn;
           set_ss_context(kcb, slot);
               |--kcb->ss_ctx.ss_pending = true;
               |  //slot(kcb->ss_ctx.match_addr)同时存放了指令: brk     #0x6
               |--kcb->ss_ctx.match_addr = addr + sizeof(kprobe_opcode_t);
           kprobes_save_local_irqflag(kcb, regs);
           instruction_pointer_set(regs, slot);
               |  //将regs->pc赋值为val, 此处val就是slot, 它对应指令为sub     sp, sp, #0x60
               |--regs->pc = val

instruction_pointer_set设置了断点指令brk #0x4异常返回后执行的pc值,它就是blk_update_request原始的入口指令,当断点指令brk #0x4异常返回后,将执行blk_update_request的原始入口指令(注意:它位于另一个内存地址p->ainsn.api.insn,非原始内存地址)。由于slot槽同时还有一条断点指令brk #0x6,因此会继续执行断点指令brk #0x6

|- -brk #0x6

0xffff800012533000      sub    sp, sp, #0x60                                                                                                                                                                                        
0xffff800012533004      brk    #0x6 

执行slot槽指令后,将再次陷入断点异常,之后从断点异常退出后,将继续沿着blk_update_request原有的执行路径执行,这部分执行与kprobe没有任何区别,直到执行到函数返回处,由于pre_handler_kretprobe -> arch_prepare_kretprobe函数中替换了返回地址,因此从blk_update_request函数返回后将不会按照原有的返回地址执行,而是会执行设置的返回地址,即kretprobe_trampoline

|- -kretprobe_trampoline

/*
 * kretprobe_trampoline - temporary return target installed in x30 by
 * arch_prepare_kretprobe().
 *
 * Builds a struct pt_regs frame on the stack, calls
 * trampoline_probe_handler() with a pointer to it, then returns to the
 * address that the handler hands back in x0.
 */
SYM_CODE_START(kretprobe_trampoline)
        // The trampoline effectively takes over the stack of the probed
        // function (blk_update_request in this walkthrough).
        // Reserve stack space for a struct pt_regs frame.
        sub sp, sp, #S_FRAME_SIZE
        // Save the registers into that frame.
        save_all_base_regs
        // Pass the top of the stack, i.e. the struct pt_regs pointer, in x0.
        mov x0, sp
        bl trampoline_probe_handler
        /*
         * Replace trampoline address in lr with actual orig_ret_addr return
         * address.
         */
        mov lr, x0

        restore_all_base_regs

        add sp, sp, #S_FRAME_SIZE
        ret
SYM_CODE_END(kretprobe_trampoline)
/*
 * trampoline_probe_handler - C entry point called from kretprobe_trampoline.
 *
 * @regs: register frame saved by the trampoline.
 *
 * Forwards to kretprobe_trampoline_handler() with the trampoline's own
 * address and the stack pointer taken from @regs, and returns its result
 * as a pointer; the trampoline moves that value into lr before returning.
 */
void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs)
{
    
    
        return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline,
                                        (void *)kernel_stack_pointer(regs));
}
/*
 * kretprobe_trampoline_handler - run the return-probe handlers with kprobes
 * marked busy.
 *
 * Brackets __kretprobe_trampoline_handler() with kprobe_busy_begin() and
 * kprobe_busy_end() and returns its result unchanged.
 */
static nokprobe_inline
unsigned long kretprobe_trampoline_handler(struct pt_regs *regs,
                                void *trampoline_address,
                                void *frame_pointer)
{
    
    
        unsigned long ret;
        /*
         * Set a dummy kprobe for avoiding kretprobe recursion.
         * Since kretprobe never runs in kprobe handler, no kprobe must
         * be running at this point.
         */
        kprobe_busy_begin();
        ret = __kretprobe_trampoline_handler(regs, trampoline_address, frame_pointer);
        kprobe_busy_end();

        return ret;
}
__kretprobe_trampoline_handler(regs, trampoline_address, frame_pointer)
    |--struct kretprobe_instance *ri = NULL, *last = NULL;
    |  struct hlist_head *head;
    |--kprobe_opcode_t *correct_ret_addr = NULL;
    |--kretprobe_hash_lock(current, &head, &flags);
    |      |  //kretprobe_inst_table哈希上链接了初始化的kretprobe_instance实例
    |      |--*head = &kretprobe_inst_table[hash];
    |  //遍历kretprobe_inst_table哈希上链接了初始化的kretprobe_instance实例
    |--hlist_for_each_entry(ri, head, hlist)
    |       //找到与blk_mq_end_request帧指针相同的kretprobe_instance实例
    |       if (ri->fp != frame_pointer)
    |           skipped = true;
    |           continue;
    |       //获取原始的返回地址, 用于trampoline_address执行完毕后返回到blk_mq_end_request
    |       correct_ret_addr = ri->ret_addr;
    |       if (correct_ret_addr != trampoline_address)
    |           break;
    |--last = ri
    |--hlist_for_each_entry_safe(ri, tmp, head, hlist)
    |      if (ri->task != current) 
    |          continue;
    |      if (ri->fp != frame_pointer)
    |          continue;
    |      if (ri->rp && ri->rp->handler)
    |          struct kprobe *prev = kprobe_running()
    |           ri->ret_addr = correct_ret_addr;
    |           ri->rp->handler(ri, regs);
    |      recycle_rp_inst(ri);
    \--return (unsigned long)correct_ret_addr;
    

根据前面初始化,ri->rp->handler为kretprobe_dispatcher

/*
 * kretprobe_dispatcher - return-probe handler used by trace_kprobe events.
 *
 * Recovers the owning trace_kprobe from ri->rp, increments its hit counter
 * with raw_cpu_inc(), and forwards the hit to the trace and/or perf
 * back ends depending on which TP_FLAG_* bits are set. Always returns 0.
 */
static int
kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
    
    
        struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);

        raw_cpu_inc(*tk->nhit);

        /* Emit a trace event when tracing is enabled for this probe. */
        if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE))
                kretprobe_trace_func(tk, ri, regs);
#ifdef CONFIG_PERF_EVENTS
        /* Feed perf when profiling is enabled for this probe. */
        if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE))
                kretprobe_perf_func(tk, ri, regs);
#endif
        return 0;       /* We don't tweak kernel, so just return 0 */
}
/*
 * kretprobe_trace_func - emit the return-probe event to every linked
 * event file.
 *
 * Calls __kretprobe_trace_func() once for each link visited by
 * trace_probe_for_each_link_rcu(), passing that link's file.
 */
static void
kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
                     struct pt_regs *regs)
{
    
    
        struct event_file_link *link;

        trace_probe_for_each_link_rcu(link, &tk->tp)
                __kretprobe_trace_func(tk, ri, regs, link->file);
}
/*
 * __kretprobe_trace_func - write one return-probe event for @trace_file.
 *
 * @tk:         trace_kprobe that owns the return probe.
 * @ri:         instance describing this particular function return.
 * @regs:       register state at the time of the return.
 * @trace_file: event file the record is written for.
 *
 * Reserves space for the event, fills in the probed function address, the
 * original return address and the fetched arguments, then commits the
 * record. Returns early, writing nothing, when the event file is
 * soft-disabled or the reservation fails.
 *
 * NOTE(review): the excerpt had lost its return-type line; the function
 * uses bare "return;" so it is declared void here. Upstream presumably
 * also marks it nokprobe_inline -- confirm against kernel/trace/trace_kprobe.c.
 */
static void
__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
                       struct pt_regs *regs,
                       struct trace_event_file *trace_file)
{
        struct kretprobe_trace_entry_head *entry;
        struct trace_event_buffer fbuffer;
        struct trace_event_call *call = trace_probe_event_call(&tk->tp);
        int dsize;

        WARN_ON(call != trace_file->event_call);

        if (trace_trigger_soft_disabled(trace_file))
                return;

        local_save_flags(fbuffer.flags);
        fbuffer.pc = preempt_count();
        fbuffer.trace_file = trace_file;

        /* The record holds the entry head, the fixed-size args and dsize extra bytes. */
        dsize = __get_data_size(&tk->tp, regs);
        fbuffer.event =
                trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file,
                                        call->event.type,
                                        sizeof(*entry) + tk->tp.size + dsize,
                                        fbuffer.flags, fbuffer.pc);
        if (!fbuffer.event)
                return;

        fbuffer.regs = regs;
        entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
        entry->func = (unsigned long)tk->rp.kp.addr;
        /* Report the original return address as the event's ret_ip. */
        entry->ret_ip = (unsigned long)ri->ret_addr;
        /* Store the argument values right after the entry head. */
        store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
        /* Commit the record to the ring buffer. */
        trace_event_buffer_commit(&fbuffer);
}

由于trampoline_probe_handler最终返回的是原始的返回地址,会被保存在x0中,回到kretprobe_trampoline中,
mov lr, x0 用来恢复链接寄存器,从kretprobe_trampoline返回后将返回到blk_mq_end_request的原始路径执行

6. 总结

我们再来简单总结kretprobe的工作流程:

  1. 首先要注册kretprobe
    这主要是通过向/sys/kernel/debug/tracing/kprobe_events节点写入命令完成,这个过程将会:
    (1)完成kretprobe的注册,这其中最重要的是初始化pre_handler回调为pre_handler_kretprobe,它将在brk #0x4断点处理函数中被调用,它主要保存从blk_update_request返回的原始的返回地址,同时设置了临时的返回函数为kretprobe_trampoline用于执行kretprobe功能;
    (2)保存被探测函数入口的原始指令,再加上一条brk #0x6断点指令,它们会被保存到slot中,将来被替换的brk #0x4返回后将首先执行此slot中原始的指令代码;
    (3)同时也会记录探测点的后一条指令地址,将来从brk #0x6返回时将执行此指令,从而恢复原始的指令执行路径;

  2. 断点指令插入
    主要通过echo 1 > /sys/kernel/debug/tracing/events/kprobes/blk_update/enable完成。它将会将被探测函数探测点的指令替换为brk #0x4。
    注:brk #0x4和brk #0x6将对应不同的断点处理回调

  3. 执行kretprobe回调
    当进入被探测函数探测点时,会执行brk #0x4断点指令引发断点异常,根据立即数0x4将执行对应的断点处理回调,最终将执行pre_handler_kretprobe回调,它主要用于保存blk_update_request的原始返回地址,并将返回地址替换为kretprobe_trampoline;之后将执行第1步初始化好的slot槽中的指令,slot槽的第一条指令就是被探测函数原始入口处的指令,之后将执行brk #0x6再次陷入断点异常,此时根据立即数0x6将执行断点单步异常处理函数,它会用第1步(3)中记录的指令地址恢复PC,这样brk #0x6返回时,将继续沿着被探测函数探测点之后的指令路径执行,恢复正常的指令执行路径。在blk_update_request返回处,会跳转到临时返回地址kretprobe_trampoline,完成kretprobe的功能,之后将返回地址修改为blk_update_request原始的返回地址,这样从kretprobe_trampoline返回后,将返回到blk_mq_end_request中原始的返回地址继续执行。

执行结果如下:

/ # cat /sys/kernel/debug/tracing/trace
# tracer: nop
#
# entries-in-buffer/entries-written: 17/17   #P:2
#
#                                _-----=> irqs-off
#                               / _----=> need-resched
#                              | / _---=> hardirq/softirq
#                              || / _--=> preempt-depth
#                              ||| /     delay
#           TASK-PID     CPU#  ||||   TIMESTAMP  FUNCTION
#              | |         |   ||||      |         |
          <idle>-0       [000] d.s3    16.343746: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
          <idle>-0       [000] d.s3    16.345540: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
          <idle>-0       [000] d.s4    16.346275: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
     ksoftirqd/0-9       [000] d.s2    16.348317: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
     ksoftirqd/0-9       [000] d.s3    16.348719: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
    kworker/u4:0-7       [000] d.s3    35.025022: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
    kworker/u4:0-7       [000] d.s3    40.146858: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
          <idle>-0       [000] d.s3    45.536322: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
     ksoftirqd/0-9       [000] d.s2    45.566258: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
     ksoftirqd/0-9       [000] d.s2    45.599543: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
     ksoftirqd/0-9       [000] dNs3    45.612681: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
             cat-116     [000] dNs3    51.593771: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
          <idle>-0       [000] d.s3    56.762610: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
    kworker/0:1H-97      [000] d.s3    56.794490: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
      jbd2/vda-8-103     [000] d.s4    56.814426: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
     ksoftirqd/0-9       [000] d.s2    56.825756: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0
     ksoftirqd/0-9       [000] d.s3    56.826898: blk_update: (blk_mq_end_request+0x30/0x10c <- blk_update_request) arg1=0x0

附录

/*
 * struct kretprobe - a return probe: an entry kprobe plus a pool of
 * per-call instances.
 */
struct kretprobe {
        /* Embedded entry probe; register_kretprobe() sets its pre_handler
         * to pre_handler_kretprobe. */
        struct kprobe kp;
        /* Called for each instance from the trampoline handler; set to
         * kretprobe_dispatcher for trace_kprobe events. */
        kretprobe_handler_t handler;
        /* NOTE(review): presumably an optional handler run at function
         * entry -- not exercised in this walkthrough, confirm upstream. */
        kretprobe_handler_t entry_handler;
        /* Number of instances pre-allocated by register_kretprobe(). */
        int maxactive;
        /* Reset to 0 at registration; presumably counts hits missed when
         * no free instance is available -- TODO confirm. */
        int nmissed;
        /* Extra bytes allocated after each kretprobe_instance. */
        size_t data_size;
        /* Instances that are currently unused. */
        struct hlist_head free_instances;
        /* Initialised in register_kretprobe(). */
        raw_spinlock_t lock;
};
/*
 * struct kretprobe_instance - state for one in-flight call of a probed
 * function.
 */
struct kretprobe_instance {
    
    
        union {
    
    
                /* Links the instance into rp->free_instances or, once
                 * armed, into kretprobe_inst_table[hash]. */
                struct hlist_node hlist;
                struct rcu_head rcu;
        };
        /* The kretprobe this instance belongs to. */
        struct kretprobe *rp;
        // Original return address, saved by arch_prepare_kretprobe()
        kprobe_opcode_t *ret_addr;
        /* Task that entered the probed function (set to current). */
        struct task_struct *task;
        /* Stack pointer recorded at entry; compared against the
         * trampoline's frame_pointer to find the matching instance. */
        void *fp;
        /* Trailing per-instance data of rp->data_size bytes. */
        char data[];
};

猜你喜欢

转载自blog.csdn.net/jasonactions/article/details/121065795