首先,借用一张图来说明 vdso 的原理,下图描述了 vdso 在 arm64 上的工作原理:
总的来说,vdso 分为用户态部分和内核态部分。
vdso 用户态部分提供 __vdso_xxx() 接口(或者是 __kernel_xxx() 接口,如上图所示)。这些接口由 glibc 调用,glibc 再将结果返回给用户程序;它们的实现也比较简单,就是读取 vdso_data 里面的内容并进行简单处理,然后将结果返回给 glibc。vdso_data 是一个与体系结构相关的结构体,下面是 arm64 的例子,它定义在 arch/arm64/include/asm/vdso_datapage.h 文件中:
/*
 * Data shared between the kernel and the user-space vDSO code (arm64).
 * The kernel-side writers shown in this document are update_vsyscall()
 * (time-related fields) and update_vsyscall_tz() (the two tz_* fields).
 * Member order matters: see the note on the cs_* members below.
 */
struct vdso_data {
__u64 cs_cycle_last; /* Timebase at clocksource init */
__u64 raw_time_sec; /* Raw time */
__u64 raw_time_nsec; /* Copied from tkr_raw.xtime_nsec without shifting */
__u64 xtime_clock_sec; /* Kernel time */
__u64 xtime_clock_nsec; /* Copied from tkr_mono.xtime_nsec without shifting */
__u64 xtime_coarse_sec; /* Coarse time */
__u64 xtime_coarse_nsec; /* tkr_mono.xtime_nsec already shifted right by tkr_mono.shift */
__u64 wtm_clock_sec; /* Wall to monotonic time */
__u64 wtm_clock_nsec;
__u32 tb_seq_count; /* Timebase sequence counter */
/* cs_* members must be adjacent and in this order (ldp accesses) */
__u32 cs_mono_mult; /* NTP-adjusted clocksource multiplier */
__u32 cs_shift; /* Clocksource shift (mono = raw) */
__u32 cs_raw_mult; /* Raw clocksource multiplier */
__u32 tz_minuteswest; /* Whacky timezone stuff */
__u32 tz_dsttime; /* Copied from sys_tz.tz_dsttime */
/*
 * Inverse of the clocksource's archdata.vdso_direct flag. When non-zero,
 * update_vsyscall() skips refreshing the cs_*, raw_time_* and xtime_clock_*
 * fields. NOTE(review): presumably tells the vDSO code to fall back to a
 * real syscall -- confirm against the vDSO reader.
 */
__u32 use_syscall;
__u32 hrtimer_res; /* Copy of hrtimer_resolution, stored with WRITE_ONCE() */
};
vdso 内核态部分的主要功能是更新 vdso_data 里面的内容,更新的时机包括:某些系统调用被执行时,比如 settimeofday()、clock_settime();或者是某些(与 vdso_data 相关的)内核变量被修改时,比如 jiffies。更新的调用链包括:
- do_sys_settimeofday64() --> update_vsyscall_tz()
- timekeeping_update() --> update_vsyscall()
其中,update_vsyscall_tz()、update_vsyscall() 是最终执行更新的函数,是和体系结构相关的,比如 arm64 的定义在 ./arch/arm64/kernel/vdso.c 文件中:
/*
 * Update the vDSO data page to keep in sync with kernel timekeeping.
 *
 * @tk: timekeeper whose current values are copied into vdso_data.
 *
 * All stores are bracketed by two increments of tb_seq_count, each
 * separated from the data stores by smp_wmb(): the counter changes before
 * the first field is written and again after the last one.
 * NOTE(review): readers presumably retry while the counter is odd or has
 * changed -- confirm against the vDSO reader code.
 */
void update_vsyscall(struct timekeeper *tk)
{
/* use_syscall is the inverse of the clocksource's vdso_direct flag. */
u32 use_syscall = !tk->tkr_mono.clock->archdata.vdso_direct;
/* Open the update window: bump the counter, then order it before the data. */
++vdso_data->tb_seq_count;
smp_wmb();
/* These fields are refreshed unconditionally. */
vdso_data->use_syscall = use_syscall;
vdso_data->xtime_coarse_sec = tk->xtime_sec;
/* The coarse nanoseconds are stored with the clocksource shift applied. */
vdso_data->xtime_coarse_nsec = tk->tkr_mono.xtime_nsec >>
tk->tkr_mono.shift;
vdso_data->wtm_clock_sec = tk->wall_to_monotonic.tv_sec;
vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec;
/* Read without the seqlock held by clock_getres() */
WRITE_ONCE(vdso_data->hrtimer_res, hrtimer_resolution);
/* Clocksource parameters are only refreshed when use_syscall is zero. */
if (!use_syscall) {
/* tkr_mono.cycle_last == tkr_raw.cycle_last */
vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last;
vdso_data->raw_time_sec = tk->raw_sec;
/* Unlike the coarse value, these nsec fields are stored unshifted. */
vdso_data->raw_time_nsec = tk->tkr_raw.xtime_nsec;
vdso_data->xtime_clock_sec = tk->xtime_sec;
vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
vdso_data->cs_mono_mult = tk->tkr_mono.mult;
vdso_data->cs_raw_mult = tk->tkr_raw.mult;
/* tkr_mono.shift == tkr_raw.shift */
vdso_data->cs_shift = tk->tkr_mono.shift;
}
/* Close the update window: order the data before the final counter bump. */
smp_wmb();
++vdso_data->tb_seq_count;
}
/*
 * Copy the kernel's timezone settings (sys_tz) into the vDSO data page.
 * The two fields are independent plain stores; no sequence counter or
 * barrier is used here.
 */
void update_vsyscall_tz(void)
{
	vdso_data->tz_dsttime = sys_tz.tz_dsttime;
	vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
}
两个重要的小问题:
1)用户程序如何访问到 vdso 的代码段?
vdso 的基本原理是在应用程序启动时,将内核中的 vdso 代码段和数据页映射到用户进程地址空间中,这件事由 arch_setup_additional_pages() 函数完成,该函数与体系结构相关,对于 arm64 而言,它定义在 ./arch/arm64/kernel/vdso.c 文件中,如下所示:
/*
 * Map the vDSO into the address space of the current process.
 *
 * @bprm:        not used in this function body.
 * @uses_interp: not used in this function body.
 *
 * Two special mappings are installed back to back: one page described by
 * vdso_spec[0] (read-only), followed by vdso_text_len bytes described by
 * vdso_spec[1] (readable and executable). mm->context.vdso is set to the
 * start of the second mapping.
 *
 * Returns 0 on success, -EINTR if the mmap_sem write lock could not be
 * taken, or the error encoded by get_unmapped_area() /
 * _install_special_mapping() on failure.
 */
int arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp)
{
struct mm_struct *mm = current->mm;
unsigned long vdso_base, vdso_text_len, vdso_mapping_len;
void *ret;
/* Size of the second mapping: vdso_pages pages. */
vdso_text_len = vdso_pages << PAGE_SHIFT;
/* Be sure to map the data page */
vdso_mapping_len = vdso_text_len + PAGE_SIZE;
/* Everything below runs with mmap_sem held for writing. */
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
/* Reserve one contiguous range large enough for both mappings. */
vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0);
if (IS_ERR_VALUE(vdso_base)) {
ret = ERR_PTR(vdso_base);
goto up_fail;
}
/* First mapping: a single page at the start of the range, read-only. */
ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE,
VM_READ|VM_MAYREAD,
&vdso_spec[0]);
if (IS_ERR(ret))
goto up_fail;
/* Step past the first page; the second mapping starts here. */
vdso_base += PAGE_SIZE;
mm->context.vdso = (void *)vdso_base;
/* Second mapping: vdso_text_len bytes, readable and executable. */
ret = _install_special_mapping(mm, vdso_base, vdso_text_len,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
&vdso_spec[1]);
if (IS_ERR(ret))
goto up_fail;
up_write(&mm->mmap_sem);
return 0;
/* Failure path: clear the recorded vDSO address, drop the lock, return the error. */
up_fail:
mm->context.vdso = NULL;
up_write(&mm->mmap_sem);
return PTR_ERR(ret);
}
同时,arch_setup_additional_pages() 函数会被 ./fs/binfmt_elf.c 文件中的 load_elf_binary() 函数调用,该函数用来加载用户提供的 elf 二进制可执行程序,这也证实了前面所说的进程在启动时的映射过程,这部分操作是在内核态完成的。
接着,在 glibc 中会通过 _dl_vdso_vsym() 函数来获取这些 __kernel_xxx() 函数的地址,比如:sysdeps/unix/sysv/linux/aarch64/init-first.c 文件中的 _libc_vdso_platform_setup() 函数:
#include <dl-vdso.h>
#include <libc-vdso.h>
/*
 * Function pointers for the vDSO entry points. VDSO_SYMBOL(x) names the
 * pointer variable for x; each one is assigned in
 * _libc_vdso_platform_setup() below.
 */
int (*VDSO_SYMBOL(gettimeofday)) (struct timeval *, void *) attribute_hidden;
int (*VDSO_SYMBOL(clock_gettime)) (clockid_t, struct timespec *);
int (*VDSO_SYMBOL(clock_getres)) (clockid_t, struct timespec *);
/*
 * Look up the three __kernel_* vDSO symbols and store the results in the
 * pointers declared above. Every result is passed through PTR_MANGLE()
 * before being stored, and a different version string is requested
 * depending on whether __LP64__ is defined.
 */
static inline void
_libc_vdso_platform_setup (void)
{
/* NOTE(review): PREPARE_VERSION presumably declares linux_version, which is used below -- confirm in dl-vdso.h. */
#ifdef __LP64__
PREPARE_VERSION (linux_version, "LINUX_2.6.39", 123718537);
#else
PREPARE_VERSION (linux_version, "LINUX_4.9", 61765625);
#endif
/* The same temporary p is reused for all three lookups. */
void *p = _dl_vdso_vsym ("__kernel_gettimeofday", &linux_version);
PTR_MANGLE (p);
VDSO_SYMBOL(gettimeofday) = p;
p = _dl_vdso_vsym ("__kernel_clock_gettime", &linux_version);
PTR_MANGLE (p);
VDSO_SYMBOL(clock_gettime) = p;
p = _dl_vdso_vsym ("__kernel_clock_getres", &linux_version);
PTR_MANGLE (p);
VDSO_SYMBOL(clock_getres) = p;
}
/* VDSO_SETUP is defined before the generic init-first.c is included. */
#define VDSO_SETUP _libc_vdso_platform_setup
#include <csu/init-first.c>
其中,VDSO_SYMBOL() 宏的定义是:
/* Paste the given name onto the __vdso_ prefix, e.g. VDSO_SYMBOL(gettimeofday) expands to __vdso_gettimeofday. */
#define VDSO_SYMBOL(sym) __vdso_##sym
最终,glibc 通过 INLINE_VSYSCALL() 宏调用这些 vdso 函数;如果对应的 vdso 函数指针为空,或者调用返回 ENOSYS,则回退到真正的系统调用:
/*
 * Call the vDSO implementation of syscall `name`, falling back to the real
 * syscall when needed. The macro is a statement expression whose value is
 * sc_ret.
 *
 *  - The pointer __vdso_<name> is demangled; if it is non-NULL it is called
 *    through INTERNAL_VSYSCALL_CALL.
 *  - If that call reports no error, its result is the value of the macro.
 *  - If it reports an error other than ENOSYS, errno is set and the value
 *    is -1.
 *  - If the pointer is NULL, or the vDSO call failed with ENOSYS, the real
 *    syscall is issued through INTERNAL_SYSCALL; on error errno is set and
 *    the value is -1.
 *
 * The iserr label sits inside the error branch of the fallback path so that
 * both failure cases share the same errno handling.
 */
# define INLINE_VSYSCALL(name, nr, args...) \
({ \
__label__ out; \
__label__ iserr; \
INTERNAL_SYSCALL_DECL (sc_err); \
long int sc_ret; \
\
__typeof (__vdso_##name) vdsop = __vdso_##name; \
PTR_DEMANGLE (vdsop); \
if (vdsop != NULL) \
{ \
sc_ret = INTERNAL_VSYSCALL_CALL (vdsop, sc_err, nr, ##args); \
if (!INTERNAL_SYSCALL_ERROR_P (sc_ret, sc_err)) \
goto out; \
if (INTERNAL_SYSCALL_ERRNO (sc_ret, sc_err) != ENOSYS) \
goto iserr; \
} \
\
sc_ret = INTERNAL_SYSCALL (name, sc_err, nr, ##args); \
if (INTERNAL_SYSCALL_ERROR_P (sc_ret, sc_err)) \
{ \
iserr: \
__set_errno (INTERNAL_SYSCALL_ERRNO (sc_ret, sc_err)); \
sc_ret = -1L; \
} \
out: \
sc_ret; \
})
2)用户程序如何访问到 vdso 的数据页(vdso_data)?
vdso_data 的访问由 vdso 用户态的代码完成,比如 arm64 中的 __kernel_gettimeofday() 函数,它定义在 ./arch/arm64/kernel/vdso/gettimeofday.S 文件中。由于它是用汇编写的,为了便于理解,可以参看 x86 中的 __vdso_gettimeofday() 函数。上面已经说过,__vdso_xxx() 和 __kernel_xxx() 表达的都是 vdso 版的 xxx 系统调用,只是不同的体系结构叫法不一样:
/*
 * vDSO version of gettimeofday().
 *
 * @tv: if non-NULL, receives the current time.
 * @tz: if non-NULL, receives the timezone fields read from gtod.
 *
 * Returns 0, or the result of vdso_fallback_gtod() when do_realtime()
 * reports VCLOCK_NONE.
 */
notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
{
	if (likely(tv != NULL)) {
		/* do_realtime() takes the same storage viewed as a timespec. */
		struct timespec *ts = (struct timespec *)tv;

		if (unlikely(do_realtime(ts) == VCLOCK_NONE))
			return vdso_fallback_gtod(tv, tz);

		/* NOTE(review): presumably converts nanoseconds to microseconds -- confirm. */
		tv->tv_usec = tv->tv_usec / 1000;
	}

	/* The timezone is rarely requested; finish early without it. */
	if (likely(tz == NULL))
		return 0;

	tz->tz_minuteswest = gtod->tz_minuteswest;
	tz->tz_dsttime = gtod->tz_dsttime;
	return 0;
}
其中 gtod 就是指向 vdso_data 的指针,《x86中vdso数据段的初始化及更新和使用》这篇博客里面有详细讲解。