vdso原理学习

首先,借用一张图来说明 vdso 的原理。下图描述的是 vdso 在 arm64 上的工作原理(注:原图在本文中缺失):


总的来说,vdso 分为用户态部分和内核态部分。

vdso 用户态部分提供 __vdso_xxx() 接口(在有些体系结构上叫 __kernel_xxx(),如上图所示)。这些接口由 glibc 调用,glibc 再将结果返回给用户程序。它们的实现比较简单:读取 vdso_data 里面的内容并进行简单处理,然后将结果返回给 glibc。vdso_data 是一个结构体,其定义与体系结构相关,下面是 arm64 的例子,它定义在 arch/arm64/include/asm/vdso_datapage.h 文件中:

/*
 * Layout of the vDSO data page.  The kernel fills these fields in
 * update_vsyscall() and update_vsyscall_tz(); the user-mode vDSO routines
 * read them.  Field order matters: see the note on the cs_* members.
 */
struct vdso_data {
    __u64 cs_cycle_last;    /* Timebase at clocksource init */
    __u64 raw_time_sec; /* Raw time */
    __u64 raw_time_nsec;
    __u64 xtime_clock_sec;  /* Kernel time */
    __u64 xtime_clock_nsec;
    __u64 xtime_coarse_sec; /* Coarse time */
    __u64 xtime_coarse_nsec;
    __u64 wtm_clock_sec;    /* Wall to monotonic time */
    __u64 wtm_clock_nsec;
    __u32 tb_seq_count; /* Timebase sequence counter */
    /* cs_* members must be adjacent and in this order (ldp accesses) */
    __u32 cs_mono_mult; /* NTP-adjusted clocksource multiplier */
    __u32 cs_shift;     /* Clocksource shift (mono = raw) */
    __u32 cs_raw_mult;  /* Raw clocksource multiplier */
    __u32 tz_minuteswest;   /* Whacky timezone stuff */
    __u32 tz_dsttime;
    /*
     * Non-zero when the clocksource is not flagged vdso_direct; in that
     * case update_vsyscall() leaves the cs_*, raw_time_* and xtime_clock_*
     * fields untouched.  Presumably tells the vDSO readers to fall back to
     * a real system call -- confirm against the reader side.
     */
    __u32 use_syscall;
    /* Copy of hrtimer_resolution; clock_getres() reads it without the seqlock */
    __u32 hrtimer_res;
};

vdso 内核态部分的主要功能是更新 vdso_data 里面的内容,更新的时机包括:某些系统调用被执行时,比如 settimeofday()、clock_settime();或者某些(与 vdso_data 相关的)内核变量被修改时,比如 jiffies。更新的调用链包括:

  • do_sys_settimeofday64() --> update_vsyscall_tz()
  • timekeeping_update() --> update_vsyscall()

其中,update_vsyscall_tz()、update_vsyscall() 是最终执行更新的函数,是和体系结构相关的,比如 arm64 的定义在 ./arch/arm64/kernel/vdso.c 文件中:

/*
 * Update the vDSO data page to keep in sync with kernel timekeeping.
 *
 * @tk: timekeeper whose current values are copied into vdso_data.
 *
 * tb_seq_count is incremented once before and once after the field
 * updates, and an smp_wmb() separates each increment from the stores in
 * between.  Presumably the vDSO readers use the counter to detect and
 * retry a concurrent update -- confirm against the reader side.
 */
void update_vsyscall(struct timekeeper *tk)
{
    /* Non-zero when the current clocksource is not flagged vdso_direct. */
    u32 use_syscall = !tk->tkr_mono.clock->archdata.vdso_direct;

    /* First counter bump: marks the start of the update. */
    ++vdso_data->tb_seq_count;
    smp_wmb();

    /* These fields are refreshed regardless of use_syscall. */
    vdso_data->use_syscall          = use_syscall;
    vdso_data->xtime_coarse_sec     = tk->xtime_sec;
    /* Coarse nanoseconds are stored already shifted down. */
    vdso_data->xtime_coarse_nsec        = tk->tkr_mono.xtime_nsec >>
                            tk->tkr_mono.shift;
    vdso_data->wtm_clock_sec        = tk->wall_to_monotonic.tv_sec;
    vdso_data->wtm_clock_nsec       = tk->wall_to_monotonic.tv_nsec;

    /* Read without the seqlock held by clock_getres() */
    WRITE_ONCE(vdso_data->hrtimer_res, hrtimer_resolution);

    /*
     * The clocksource parameters and the unshifted time values are only
     * refreshed when use_syscall is zero.
     */
    if (!use_syscall) {
        /* tkr_mono.cycle_last == tkr_raw.cycle_last */
        vdso_data->cs_cycle_last    = tk->tkr_mono.cycle_last;
        vdso_data->raw_time_sec         = tk->raw_sec;
        vdso_data->raw_time_nsec        = tk->tkr_raw.xtime_nsec;
        vdso_data->xtime_clock_sec  = tk->xtime_sec;
        vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
        vdso_data->cs_mono_mult     = tk->tkr_mono.mult;
        vdso_data->cs_raw_mult      = tk->tkr_raw.mult;
        /* tkr_mono.shift == tkr_raw.shift */
        vdso_data->cs_shift     = tk->tkr_mono.shift;
    }

    /* Second counter bump: marks the end of the update. */
    smp_wmb();
    ++vdso_data->tb_seq_count;
}
       
/*
 * Mirror the kernel-wide timezone (sys_tz) into the vDSO data page.
 * Both fields are plain stores; tb_seq_count is not touched here.
 */
void update_vsyscall_tz(void)
{
    vdso_data->tz_dsttime = sys_tz.tz_dsttime;
    vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
}

两个重要的小问题:

1)用户程序如何访问到 vdso 的代码段?
vdso 的基本原理是在应用程序启动时,将内核中的 vdso 代码段和数据页映射到用户进程地址空间中,这件事由 arch_setup_additional_pages() 函数完成,该函数与体系结构相关,对于 arm64 而言,它定义在 ./arch/arm64/kernel/vdso.c 文件中,如下所示:

int arch_setup_additional_pages(struct linux_binprm *bprm,
                int uses_interp)
{ 
    struct mm_struct *mm = current->mm;
    unsigned long vdso_base, vdso_text_len, vdso_mapping_len;
    void *ret;
  
    vdso_text_len = vdso_pages << PAGE_SHIFT;
    /* Be sure to map the data page */
    vdso_mapping_len = vdso_text_len + PAGE_SIZE;
  
    if (down_write_killable(&mm->mmap_sem))
        return -EINTR;
    vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); 
    if (IS_ERR_VALUE(vdso_base)) {
        ret = ERR_PTR(vdso_base);
        goto up_fail;
    }
    ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE,
                       VM_READ|VM_MAYREAD,
                       &vdso_spec[0]);
    if (IS_ERR(ret))
        goto up_fail;
  
    vdso_base += PAGE_SIZE;
    mm->context.vdso = (void *)vdso_base;
    ret = _install_special_mapping(mm, vdso_base, vdso_text_len,
                       VM_READ|VM_EXEC|
                       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
                       &vdso_spec[1]);
    if (IS_ERR(ret))
        goto up_fail;
  
  
    up_write(&mm->mmap_sem);
    return 0;
  
up_fail:
    mm->context.vdso = NULL;
    up_write(&mm->mmap_sem);
    return PTR_ERR(ret);
} 

同时,arch_setup_additional_pages() 函数会被 ./fs/binfmt_elf.c 文件中的 load_elf_binary() 函数调用,该函数用来加载用户提供的 elf 二进制可执行程序,这也证实了前面所说的进程在启动时的映射过程,这部分操作是在内核态完成的。

接着,在 glibc 中会通过 _dl_vdso_vsym() 函数来获取这些 __kernel_xxx() 函数的地址,比如:sysdeps/unix/sysv/linux/aarch64/init-first.c 文件中的 _libc_vdso_platform_setup() 函数:

#include <dl-vdso.h>
#include <libc-vdso.h>

/* Function pointers that hold the vDSO entry points; they are filled in
   by _libc_vdso_platform_setup below.  Only the gettimeofday pointer is
   declared attribute_hidden here.  */
int (*VDSO_SYMBOL(gettimeofday)) (struct timeval *, void *) attribute_hidden;
int (*VDSO_SYMBOL(clock_gettime)) (clockid_t, struct timespec *);
int (*VDSO_SYMBOL(clock_getres)) (clockid_t, struct timespec *);

/* Look up the __kernel_* entry points in the vDSO and store each one,
   after PTR_MANGLE, in the matching VDSO_SYMBOL function pointer.  */
static inline void
_libc_vdso_platform_setup (void)
{
  /* The symbol version requested from the vDSO differs between LP64 and
     non-LP64 builds.  */
#ifdef __LP64__
  PREPARE_VERSION (linux_version, "LINUX_2.6.39", 123718537);
#else
  PREPARE_VERSION (linux_version, "LINUX_4.9", 61765625);
#endif

  void *gtod_entry = _dl_vdso_vsym ("__kernel_gettimeofday", &linux_version);
  PTR_MANGLE (gtod_entry);
  VDSO_SYMBOL(gettimeofday) = gtod_entry;

  void *gettime_entry = _dl_vdso_vsym ("__kernel_clock_gettime",
				       &linux_version);
  PTR_MANGLE (gettime_entry);
  VDSO_SYMBOL(clock_gettime) = gettime_entry;

  void *getres_entry = _dl_vdso_vsym ("__kernel_clock_getres",
				      &linux_version);
  PTR_MANGLE (getres_entry);
  VDSO_SYMBOL(clock_getres) = getres_entry;
}

/* Name the platform setup routine before the generic init-first.c is
   included; presumably that file invokes VDSO_SETUP during start-up --
   confirm in csu/init-first.c.  */
#define VDSO_SETUP _libc_vdso_platform_setup

#include <csu/init-first.c>

其中,VDSO_SYMBOL() 宏的定义是:

/* Build the identifier __vdso_<sym> from a bare symbol name.  */
#define VDSO_SYMBOL(sym) __vdso_##sym

最终,glibc 通过 INLINE_VSYSCALL() 宏进行调用。该宏优先调用 vdso 版本的函数,当函数指针为空或者调用返回 ENOSYS 时,才退回到真正的系统调用:

/*
 * INLINE_VSYSCALL(name, nr, args...) - call system call `name`, trying the
 * vDSO implementation first.
 *
 * The macro is a statement expression whose value is sc_ret:
 *   1. Copy the __vdso_<name> function pointer and PTR_DEMANGLE it.
 *   2. If the pointer is non-NULL, call it through INTERNAL_VSYSCALL_CALL.
 *      - No error: jump to `out` and yield the result.
 *      - Error other than ENOSYS: jump to `iserr`, which sets errno and
 *        makes the result -1.
 *      - ENOSYS: fall through to step 3.
 *   3. Otherwise (NULL pointer or ENOSYS) issue the real system call with
 *      INTERNAL_SYSCALL; on error set errno and make the result -1.
 *
 * `out` and `iserr` are declared with __label__, so they are local to each
 * expansion of the macro.  Note that `iserr` sits inside the error branch
 * of step 3 and is entered by goto from step 2.
 */
# define INLINE_VSYSCALL(name, nr, args...) \
({ \
    __label__ out; \
    __label__ iserr; \
    INTERNAL_SYSCALL_DECL (sc_err); \
    long int sc_ret; \
    \
    __typeof (__vdso_##name) vdsop = __vdso_##name; \
    PTR_DEMANGLE (vdsop); \
    if (vdsop != NULL) \
    { \
        sc_ret = INTERNAL_VSYSCALL_CALL (vdsop, sc_err, nr, ##args); \
        if (!INTERNAL_SYSCALL_ERROR_P (sc_ret, sc_err)) \
            goto out; \
        if (INTERNAL_SYSCALL_ERRNO (sc_ret, sc_err) != ENOSYS) \
            goto iserr; \
    } \
    \
    sc_ret = INTERNAL_SYSCALL (name, sc_err, nr, ##args); \
    if (INTERNAL_SYSCALL_ERROR_P (sc_ret, sc_err)) \
    { \
        iserr: \
        __set_errno (INTERNAL_SYSCALL_ERRNO (sc_ret, sc_err)); \
        sc_ret = -1L; \
    } \
out: \
    sc_ret; \
})

2)用户程序如何访问到 vdso 的数据页(vdso_data)?

vdso_data 的访问由 vdso 用户态的代码完成,比如 arm64 中的 __kernel_gettimeofday() 函数,它定义在 ./arch/arm64/kernel/vdso/gettimeofday.S 文件中。由于它是用汇编写的,为了便于理解,可以参看 x86 中用 C 实现的 __vdso_gettimeofday() 函数。前面已经说过,__vdso_xxx() 和 __kernel_xxx() 表示的都是 vdso 版的 xxx 系统调用,只是不同体系结构的叫法不一样:

/*
 * vDSO implementation of gettimeofday().
 *
 * @tv: if non-NULL, receives the current time.
 * @tz: if non-NULL, receives the timezone fields copied from gtod.
 *
 * Returns 0, or the result of vdso_fallback_gtod() when do_realtime()
 * reports VCLOCK_NONE.
 */
notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
    /* do_realtime() fills the buffer through a timespec view of it. */
    struct timespec *ts = (struct timespec *)tv;

    if (likely(ts != NULL)) {
        if (unlikely(do_realtime(ts) == VCLOCK_NONE))
            return vdso_fallback_gtod(tv, tz);
        /* Scale the second field down by 1000 so it fits tv_usec. */
        tv->tv_usec /= 1000;
    }

    if (unlikely(tz != NULL)) {
        tz->tz_minuteswest = gtod->tz_minuteswest;
        tz->tz_dsttime = gtod->tz_dsttime;
    }

    return 0;
}

其中 gtod 就是指向 vdso_data 的指针,《x86中vdso数据段的初始化及更新和使用》这篇博客里有详细讲解。

猜你喜欢

转载自blog.csdn.net/choumin/article/details/112862954