文章目录

前言
总结
参考资料

前言

原子操作API的使用请参考这篇文章：https://blog.csdn.net/weixin_45030965/article/details/125549728
本文接下来主要介绍内核原子操作在x86平台下的实现原理。

1、原子操作API

x86上用一条带有“lock”前缀的add指令来保证原子变量v加 i 操作的原子性，“lock”前缀在x86上的作用是在执行add指令时独占系统总线，这样即便系统总线上还有其他的master，在add 指令执行期间也无法修改v->counter的值。

/**
 * atomic_add - add integer to atomic variable
 * @i: integer value to add
 * @v: pointer of type atomic_t
 *
 * Atomically adds @i to @v.
 *
 * The whole operation is one "addl" on v->counter, emitted with
 * LOCK_PREFIX in front of it.  Nothing is returned.
 */
static inline void atomic_add(int i, atomic_t *v)
{
    
    
	/*
	 * %0 is v->counter as a read-write memory operand ("+m");
	 * %1 is @i, passed as an immediate or in a register ("ir").
	 */
	asm volatile(LOCK_PREFIX "addl %1,%0"
		     : "+m" (v->counter)
		     : "ir" (i));
}

/**
 * atomic_sub - subtract integer from atomic variable
 * @i: integer value to subtract
 * @v: pointer of type atomic_t
 *
 * Atomically subtracts @i from @v.
 *
 * The whole operation is one "subl" on v->counter, emitted with
 * LOCK_PREFIX in front of it.  Nothing is returned.
 */
static inline void atomic_sub(int i, atomic_t *v)
{
    
    
	/*
	 * %0 is v->counter as a read-write memory operand ("+m");
	 * %1 is @i, passed as an immediate or in a register ("ir").
	 */
	asm volatile(LOCK_PREFIX "subl %1,%0"
		     : "+m" (v->counter)
		     : "ir" (i));
}

2、LOCK_PREFIX

// arch/x86/include/asm/alternative.h

#ifdef CONFIG_SMP
/*
 * LOCK_PREFIX_HERE: assembler bookkeeping only, no instruction bytes.
 * It temporarily switches to the allocatable .smp_locks section, emits a
 * 4-byte-aligned .long holding the distance from that entry to local
 * label 671, switches back, and defines label 671 at the current position.
 */
#define LOCK_PREFIX_HERE \
		".pushsection .smp_locks,\"a\"\n"	\
		".balign 4\n"				\
		".long 671f - .\n" /* offset */		\
		".popsection\n"				\
		"671:"

/*
 * LOCK_PREFIX: the bookkeeping above followed by the "lock" prefix itself,
 * so label 671 (and therefore the .smp_locks entry) refers to the prefix.
 */
#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "

#else /* ! CONFIG_SMP */
/* Without CONFIG_SMP both macros are empty: no table entry, no lock prefix. */
#define LOCK_PREFIX_HERE ""
#define LOCK_PREFIX ""
#endif

\n：表示换行符
\t：表示将输出位置跳到下一个tab（制表）位置

对于UP单处理器，LOCK_PREFIX宏为空。

对于SMP多处理器，扩展LOCK_PREFIX宏：

.pushsection .smp_locks,"a"	
.balign 4				
.long 671f - .
.popsection
671:
	lock;

（1）
.pushsection .smp_locks,"a"：将其后的内容生成到 .smp_locks section 中，其中 "a" 是该 section 的标志：
FLAG：A (alloc) allocatable

readelf -S acpi_pad.ko  //查看一个模块的section headers信息

在这里插入图片描述
smp_locks section 就是该模块代码段中所有 lock指令的信息。

（2）
.balign 4 : 四字节对齐

（3）
.long 671f - . ：在 .smp_locks section 中存放一个4字节的相对偏移，其值为 label 671 的地址减去当前位置（.）的地址。label 671 的地址就是代码段中 lock 前缀的地址，因此用该表项自身的地址加上这个偏移，就能得到 lock 前缀的地址（相当于一个指向 lock 前缀的相对指针）。

（4）

671:
	lock;

671 label：即 lock 前缀所在的地址，从这里开始生成带 lock 前缀的指令。
上面已经说明 .smp_locks section 中存放的是到 label 671 的相对偏移，也就是记录了 lock 前缀的位置。

（5）
这段汇编代码在 .text 段生成一个 lock 指令前缀 0xf0（LOCK 前缀的编码是 0xF0），同时在 .smp_locks section 生成四个字节的、指向该 lock 前缀的相对偏移。链接的时候，所有的 .smp_locks section 合并起来，形成一个记录所有 lock 前缀位置的数组，这样遍历 .smp_locks section 就能知道代码里所有 lock 前缀的位置和数量。

（6）
常见的锁前缀作为特殊情况放在一个单独的表中处理，这个表是一个纯位置列表，不包含替换指令的指针和大小信息，这样可以使表保持较小。也就是把 .text 中各个 lock 前缀的位置（相对偏移）记录在 .smp_locks section 中。

这样我们就可以从 smp_locks section中知道 text 代码中所有带有 lock 指令前缀信息了。

3、源码分析

3.1 module_finalize

module_finalize是一个与体系架构相关的函数，允许不同体系架构的实现执行特定于系统的结束工作。简单点来说就是模块加载的结束时调用的函数。

// /arch/x86/kernel/module.c

/*
 * module_finalize - architecture-specific fixups for a module
 * @hdr:     ELF header of the module image
 * @sechdrs: section header table of the module image
 * @me:      the module being processed
 *
 * Finds the .text, .altinstructions, .smp_locks and .parainstructions
 * sections by name and passes each one that exists to its patching
 * routine, then rewrites the module's jump labels.  Always returns 0.
 */
int module_finalize(const Elf_Ehdr *hdr,
		    const Elf_Shdr *sechdrs,
		    struct module *me)
{
	const Elf_Shdr *text_sec = NULL;
	const Elf_Shdr *alt_sec = NULL;
	const Elf_Shdr *locks_sec = NULL;
	const Elf_Shdr *para_sec = NULL;
	/* Section names live in the section-header string table. */
	char *strtab = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
	unsigned int idx;

	for (idx = 0; idx < hdr->e_shnum; idx++) {
		const Elf_Shdr *sec = &sechdrs[idx];
		const char *sec_name = strtab + sec->sh_name;

		if (strcmp(".text", sec_name) == 0)
			text_sec = sec;
		else if (strcmp(".altinstructions", sec_name) == 0)
			alt_sec = sec;
		else if (strcmp(".smp_locks", sec_name) == 0)
			locks_sec = sec;
		else if (strcmp(".parainstructions", sec_name) == 0)
			para_sec = sec;
	}

	/* patch .altinstructions */
	if (alt_sec) {
		void *alt_start = (void *)alt_sec->sh_addr;

		apply_alternatives(alt_start, alt_start + alt_sec->sh_size);
	}

	/* The lock-prefix table is only meaningful together with .text. */
	if (locks_sec && text_sec) {
		void *locks_start = (void *)locks_sec->sh_addr;
		void *text_start = (void *)text_sec->sh_addr;

		alternatives_smp_module_add(me, me->name,
					    locks_start,
					    locks_start + locks_sec->sh_size,
					    text_start,
					    text_start + text_sec->sh_size);
	}

	if (para_sec) {
		void *para_start = (void *)para_sec->sh_addr;

		apply_paravirt(para_start, para_start + para_sec->sh_size);
	}

	/* make jump label nops */
	jump_label_apply_nops(me);

	return 0;
}

3.2 alternatives_smp_module_add

对于上面这段代码我们重点关注这部分，如果模块有.text 和 .smp_locks section 就调用 alternatives_smp_module_add 函数。

	（1）
	if (locks && text) {
    
    
		void *lseg = (void *)locks->sh_addr;
		void *tseg = (void *)text->sh_addr;
		alternatives_smp_module_add(me, me->name,
					    lseg, lseg + locks->sh_size,
					    tseg, tseg + text->sh_size);
	}

	（2）
	/* make jump label nops */
	jump_label_apply_nops(me);

// arch/x86/kernel/alternative.c

#ifdef CONFIG_SMP
/*
 * One record per module whose lock-prefix table has been registered by
 * alternatives_smp_module_add(); records are linked on smp_alt_modules.
 */
struct smp_alt_module {
    
    
	/* owning module and its name, as passed to alternatives_smp_module_add() */
	struct module	*mod;
	char		*name;

	/*
	 * [locks, locks_end): table of s32 entries, each the offset from the
	 * entry itself to one lock prefix.
	 */
	const s32	*locks;
	const s32	*locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8		*text;
	u8		*text_end;

	/* linkage on the smp_alt_modules list */
	struct list_head next;
};
/* All registered records; appended to while holding smp_alt. */
static LIST_HEAD(smp_alt_modules);
/* Serializes access to the list and to uniproc_patched. */
static DEFINE_MUTEX(smp_alt);
static bool uniproc_patched = false;	/* protected by smp_alt */

/*
 * alternatives_smp_module_add - apply uniprocessor lock patching to a module
 * @mod:       module that owns the ranges
 * @name:      module name, stored in the record and printed by DPRINTK
 * @locks:     start of the module's lock-prefix table
 * @locks_end: end of that table
 * @text:      start of the module's text
 * @text_end:  end of the module's text
 *
 * Does nothing unless uniproc_patched is set.  Otherwise the module's lock
 * prefixes are rewritten through alternatives_smp_unlock().  When more than
 * one CPU is possible the ranges are recorded on smp_alt_modules first; if
 * that record cannot be allocated the module is left unpatched.
 * All of this runs with smp_alt held.
 */
void __init_or_module alternatives_smp_module_add(struct module *mod,
						  char *name,
						  void *locks, void *locks_end,
						  void *text,  void *text_end)
{
	struct smp_alt_module *rec;
	bool patch_now = true;

	mutex_lock(&smp_alt);

	if (uniproc_patched) {
		if (num_possible_cpus() != 1) {
			rec = kzalloc(sizeof(*rec), GFP_KERNEL);
			if (rec != NULL) {
				rec->mod	= mod;
				rec->name	= name;
				rec->locks	= locks;
				rec->locks_end	= locks_end;
				rec->text	= text;
				rec->text_end	= text_end;
				DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
					__func__, rec->locks, rec->locks_end,
					rec->text, rec->text_end, rec->name);

				list_add_tail(&rec->next, &smp_alt_modules);
			} else {
				/* we'll run the (safe but slow) SMP code then ... */
				patch_now = false;
			}
		}
		/*
		 * With a single possible CPU nothing is recorded:
		 * we'll never have to undo the patch.
		 */

		if (patch_now)
			alternatives_smp_unlock(locks, locks_end,
						text, text_end);
	}

	mutex_unlock(&smp_alt);
}

#endif	/* CONFIG_SMP */

在 uniproc_patched 为真的前提下，如果可能的 CPU 数大于 1（多处理器），先把该模块的信息记录到 smp_alt_modules 链表中（随后同样会走到 smp_unlock 标签处打补丁）：

list_add_tail(&smp->next, &smp_alt_modules);

如果可能的 CPU 数为 1（单处理器），则不做记录，直接跳到 smp_unlock 标签，将锁前缀转换为DS段覆盖前缀：

if (num_possible_cpus() == 1)
		/* Don't bother remembering, we'll never have to undo it. */
		goto smp_unlock;
		
smp_unlock:
	alternatives_smp_unlock(locks, locks_end, text, text_end);

/*
 * alternatives_smp_unlock - turn lock prefixes into DS segment overrides
 * @start:    first entry of the lock-prefix table
 * @end:      one past the last entry of the table
 * @text:     start of the text range that may be patched
 * @text_end: end of that range (exclusive)
 *
 * Each table entry holds the offset from the entry's own address to a patch
 * site.  Zero entries and sites outside [text, text_end) are skipped.  A site
 * whose byte is 0xf0 is rewritten to 0x3E through text_poke().  The whole
 * walk is done with text_mutex held.
 */
static void alternatives_smp_unlock(const s32 *start, const s32 *end,
				    u8 *text, u8 *text_end)
{
	unsigned char ds_prefix[] = { 0x3E };
	const s32 *entry;

	mutex_lock(&text_mutex);
	for (entry = start; entry < end; entry++) {
		u8 *site = (u8 *)entry + *entry;

		/* The byte is only read once the site is known to be in range. */
		if (*entry && site >= text && site < text_end &&
		    *site == 0xf0)
			text_poke(site, ds_prefix, 1);
	}
	mutex_unlock(&text_mutex);
}

// /arch/x86/include/asm/nops.h

#define NOP_DS_PREFIX 0x3e

 0xf0 -> 0x3E :把 lock prefix 换成 DS  override prefix

从函数名我们就可以知道，如果是单处理器，就将带加锁前缀的指令“解锁”。即：即使内核配置了 smp，但是实际运行在单处理器上时，通过运行期间打补丁，根据 .smp_locks 里的记录，把 lock 指令前缀替换成 DS override prefix（它本身不是一条指令，对这些指令不产生实际影响，作用相当于 nop），以消除指令加锁的开销。

相对应有一个加锁的函数：

/*
 * alternatives_smp_lock - turn DS segment overrides back into lock prefixes
 * @start:    first entry of the lock-prefix table
 * @end:      one past the last entry of the table
 * @text:     start of the text range that may be patched
 * @text_end: end of that range (exclusive)
 *
 * Each table entry holds the offset from the entry's own address to a patch
 * site.  Zero entries and sites outside [text, text_end) are skipped.  A site
 * whose byte is 0x3e is rewritten to 0xf0 through text_poke().  The whole
 * walk is done with text_mutex held.
 */
static void alternatives_smp_lock(const s32 *start, const s32 *end,
				  u8 *text, u8 *text_end)
{
	unsigned char lock_prefix[] = { 0xf0 };
	const s32 *entry;

	mutex_lock(&text_mutex);
	for (entry = start; entry < end; entry++) {
		u8 *site = (u8 *)entry + *entry;

		/* The byte is only read once the site is known to be in range. */
		if (*entry && site >= text && site < text_end &&
		    *site == 0x3e)
			text_poke(site, lock_prefix, 1);
	}
	mutex_unlock(&text_mutex);
}

把 DS segment override prefix 替换成 lock prefix。

Instruction Prefixes
指令前缀分为四组，每组有一组可允许的前缀码。对于每条指令，从四组(组1、2、3、4)中的每一组里最多只包含一个前缀码才有意义。这里我只关注 LOCK prefix （F0H）和 DS segment override prefix（3EH）。
更多信息可参考：Intel 手册 2.1.1 Instruction Prefixes。

• Group 1
	 Lock and repeat prefixes:
	 • LOCK prefix is encoded using F0H.

• Group 2
	— Segment override prefixes:
	• 3EH—DS segment override prefix (use with any branch instruction is reserved).

3.3 jump_label_apply_nops

遍历该模块的所有jump_entry条目，传递参数：JUMP_LABEL_DISABLE。将模块的jump_entry条目填充nop空字节。
在这里插入图片描述

 W (write), A (alloc)

// /kernel/jump_label.c

/**
 * jump_label_apply_nops - switch every jump label site of a module to "disabled"
 * @mod: module to patch
 *
 * Walks the mod->num_jump_entries entries starting at mod->jump_entries and
 * passes each one to arch_jump_label_transform_static() with
 * JUMP_LABEL_DISABLE.  A module without jump label entries is left untouched.
 */
void jump_label_apply_nops(struct module *mod)
{
	struct jump_entry *entry = mod->jump_entries;
	struct jump_entry *const last = entry + mod->num_jump_entries;

	/* An empty table makes the loop body unreachable, so nothing is patched. */
	while (entry < last) {
		arch_jump_label_transform_static(entry, JUMP_LABEL_DISABLE);
		entry++;
	}
}

// arch/x86/include/asm/jump_label.h

/* Width of the fields below: u64 when CONFIG_X86_64 is set, u32 otherwise. */
#ifdef CONFIG_X86_64
typedef u64 jump_label_t;
#else
typedef u32 jump_label_t;
#endif

/* One patchable jump label site. */
struct jump_entry {
    
    
	/* address of the site; __jump_label_transform() writes its bytes here */
	jump_label_t code;
	/* address the written jump goes to when the site is enabled */
	jump_label_t target;
	/* NOTE(review): not used by the code shown here; presumably identifies the controlling key - confirm */
	jump_label_t key;
};

// /include/linux/jump_label.h
/* Requested state of a jump label site, as consumed by __jump_label_transform(). */
enum jump_label_type {
    
    
	/* site is filled with the 5-byte NOP */
	JUMP_LABEL_DISABLE = 0,
	/* site is filled with a jump to the entry's target */
	JUMP_LABEL_ENABLE,
};

// /include/linux/module.h

struct module
{
    
    
......
#ifdef HAVE_JUMP_LABEL
	struct jump_entry *jump_entries;
	unsigned int num_jump_entries;
#endif

......

/* 
 * Update code which is definitely not currently executing.
 * Architectures which need heavyweight synchronization to modify
 * running code can override this to make the non-live update case
 * cheaper.
 *
 * This is the default (__weak) version: it does no special-casing and
 * simply forwards @entry and @type to arch_jump_label_transform().
 */
void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
					    enum jump_label_type type)
{
    
    
	arch_jump_label_transform(entry, type);	
}

// /arch/x86/kernel/jump_label.c

/* Number of bytes written at every jump label site. */
#define JUMP_LABEL_NOP_SIZE 5

#define ASM_NOP_MAX 8
#define NOP_ATOMIC5 (ASM_NOP_MAX+1)	/* Entry for the 5-byte atomic NOP */

/*
 * The bytes written at a site, viewed two ways: as a raw byte array, or as
 * a one-byte opcode followed by an offset.  The struct is packed so that
 * offset starts right after the opcode byte.
 * NOTE(review): the two views only match in size if int is 4 bytes - confirm.
 */
union jump_code_union {
    
    
	char code[JUMP_LABEL_NOP_SIZE];
	struct {
    
    
		char jump;
		int offset;
	} __attribute__((packed));
};

/*
 * __jump_label_transform - write a jump or a NOP at a jump label site
 * @entry: site to patch; entry->code is the destination address
 * @type:  JUMP_LABEL_ENABLE writes a jump, any other value writes the NOP
 * @poker: routine that performs the write (destination, source, length)
 *
 * When called with JUMP_LABEL_DISABLE, as jump_label_apply_nops() does,
 * the site ends up holding the NOP.
 */
static void __jump_label_transform(struct jump_entry *entry,
				   enum jump_label_type type,
				   void *(*poker)(void *, const void *, size_t))
{
	union jump_code_union insn;

	if (type != JUMP_LABEL_ENABLE) {
		/* Disabled: use the 5-byte atomic NOP from ideal_nops. */
		memcpy(&insn, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
	} else {
		/*
		 * Enabled: opcode 0xe9 followed by the distance from the end
		 * of the patched bytes to the target.
		 */
		insn.jump = 0xe9;
		insn.offset = entry->target -
				(entry->code + JUMP_LABEL_NOP_SIZE);
	}

	/* Overwrite the bytes at the site with the chosen encoding. */
	poker((void *)entry->code, &insn, JUMP_LABEL_NOP_SIZE);
}

/*
 * arch_jump_label_transform - patch one jump label site
 * @entry: site to patch
 * @type:  JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
 *
 * Writes the site through text_poke_smp(), which must be called under
 * get_online_cpus() and text_mutex; both are taken here and released in
 * reverse order afterwards.
 */
void arch_jump_label_transform(struct jump_entry *entry,
			       enum jump_label_type type)
{
    
    
	get_online_cpus();
	mutex_lock(&text_mutex);
	__jump_label_transform(entry, type, text_poke_smp);
	mutex_unlock(&text_mutex);
	put_online_cpus();
}

// /arch/x86/kernel/alternative.c

/**
 * text_poke_smp - Update instructions on a live kernel on SMP
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Modify multi-byte instruction by using stop_machine() on SMP. This allows
 * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
 * should be allowed, since stop_machine() does _not_ protect code against
 * NMI and MCE.
 *
 * Note: Must be called under get_online_cpus() and text_mutex.
 *
 * Returns @addr.
 */
void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
{
    
    
	struct text_poke_params tpp;
	struct text_poke_param p;

	/* Describe the single requested write ... */
	p.addr = addr;
	p.opcode = opcode;
	p.len = len;
	/* ... and wrap it as a one-element parameter list. */
	tpp.params = &p;
	tpp.nparams = 1;
	/*
	 * Reset the shared state before the run.
	 * NOTE(review): its consumer, stop_machine_text_poke(), is not shown here.
	 */
	atomic_set(&stop_machine_first, 1);
	wrote_text = 0;
	/* Use __stop_machine() because the caller already got online_cpus. */
	__stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
	return addr;
}

4、LOCK指令

在这里插入图片描述
asserted：可以理解为发出信号。

使处理器的 LOCK# 信号在伴随指令的执行期间 be asserted（将指令转换为原子指令）。在多处理器环境中，LOCK# 信号确保处理器在信号被 asserted 时独占使用任何共享内存。

X86 CPU 上都具有锁定一个特定内存地址的能力，当这个特定内存地址被锁定后，它就可以阻止其他的系统总线读取或修改这个内存地址。这种能力是通过 LOCK 指令前缀再加上下面的汇编指令来实现的。当使用 LOCK 指令前缀时，它会使 CPU 发送一个 LOCK# 信号，这样就能确保在多处理器系统或多线程竞争的环境下互斥地使用这个内存地址。当指令执行完毕，这个锁定动作也就会消失。

LOCK 前缀只能添加到以下指令，并且只能添加到目标操作数是内存操作数的那些指令形式：ADD、ADC、AND、BTC、BTR、BTS、CMPXCHG、CMPXCHG8B、CMPXCHG16B、DEC、INC、 NEG、NOT、OR、SBB、SUB、XOR、XADD 和 XCHG。如果LOCK前缀与这些指令中的一个一起使用，并且源操作数是内存操作数，则可能会生成一个未定义的操作码异常(#UD)。如果 LOCK 前缀与任何不在上述列表中的指令一起使用，也会产生未定义的操作码异常。无论是否存在 LOCK 前缀，XCHG 指令始终 assert LOCK# 信号。

LOCK 前缀通常与 BTS 指令一起使用，以对共享内存环境中的内存位置执行读-修改-写操作。

LOCK前缀的完整性不受内存字段对齐的影响：对于任意未对齐的字段，内存锁定同样有效。

5、LOCK 操作对内部处理器缓存的影响

在大多数IA-32和所有Intel 64处理器中，锁可以在没有LOCK#信号 being asserted 的情况下发生：

对于IA-32 Architecture Compatibility：从 P6 系列处理器开始，当指令带有 LOCK 前缀、并且被访问的内存区域已经缓存在处理器内部时，LOCK# 信号通常不会被 asserted。相反，只有处理器的缓存被锁定。在这种情况下，处理器的缓存一致性机制确保该操作对内存而言是以原子方式执行的。
如果在 LOCK 操作期间被锁定的内存区域作为 write-back 内存缓存在执行 LOCK 操作的处理器中，并且完全包含在 cache line中，则处理器可能不会在总线上 assert LOCK# 信号。相反，它将在内部修改内存位置并允许其缓存一致性机制以确保操作以原子方式执行。此操作称为“cache locking”。缓存一致性机制自动防止缓存了相同内存区域的两个或多个处理器同时修改该区域中的数据。
更多信息请参考Volume 3A- Chapter 8-8.1 Locked Atomic Operations
https://blog.csdn.net/weixin_45030965/article/details/125709626

备注：简单来说，以 addl 指令为例子，就是x86处理器上用一条带有“lock”前缀的addl指令来保证原子变量v加i操作的原子性，“lock”前缀在x86上的作用是在执行 addl 指令时独占系统总线，这样即便系统总线上还有其他的master，在 addl 执行期间也无法修改v->counter的值。
x86处理器带“lock”前缀的指令（只能是上述列出的指令）能保证其原子性。

6、例子说明

硬件级的原子操作：在单处理器系统（UniProcessor）中，能够在单条指令中完成的操作都可以认为是“原子操作”，因为中断只发生在指令边缘。在多处理器结构中就不同了，由于系统中有多个处理器独立运行，即使能在单条指令中完成的操作也有可能受到干扰。在X86平台上，CPU提供了在指令执行期间对总线加锁的手段。CPU上有一根 LOCK# 引脚连到北桥，如果汇编语言的程序中在一条指令前面加上前缀"LOCK"，经过汇编以后的机器代码就使CPU在执行这条指令的时候把 LOCK# 引脚的电位拉低，持续到这条指令结束时放开，从而把总线锁住，这样同一总线上别的CPU就暂时不能通过总线访问内存了，保证了这条指令在多处理器环境中的原子性。

LOCK前缀作用于单个指令上，它对中断没有任何影响，因为中断只能在指令之间产生。LOCK前缀的真正作用是保持对系统总线的控制，直到整条指令执行完毕。它在一条指令多次访问内存的时候相当有用。
比如一个简单的共享资源计数器，我们需要对它进行原子递增操作，需要做如下工作：

1）从内存读取该计数器的值，临时将其保存在CPU内部寄存器中。
2）在寄存器中将读取到的值加1。
3）将被修改后的值写回内存。

在x86体系结构中，这个递增操作可以在单个指令中完成，因此中断不会对该递增操作产生影响。但是该指令有两次内存访问操作，读和写，另外一个CPU可能同时对该计数器进行递增操作。如果另外一个CPU在第1步完成后，第3步完成前读取该计数器的值，那么两个CPU都使用被修改之前的计数器值并对其进行递增操作。这样就出现了错误的情况。

如果在此时使用了LOCK前缀，一个CPU在对该计数器进行操作的时候，保持对总线的控制权，直到递增操作完毕，也就是在这期间，其它的CPU不能访问该变量，直到该CPU完成所有操作为止。

备注：单处理器系统中，单条指令是“原子操作”。多处理器系统中，单条指令的多次内存访问（读和写）之间可能插入其它CPU对同一内存的访问，因此在多处理器系统中单条指令并不一定是原子操作。在x86系统，通过在单条指令加上前缀"LOCK"，保证了这条指令在多处理器环境中的原子性。

总结

以上就是x86处理器原子操作的原理。

参考资料

Linux内核源码 3.10.0

https://blog.csdn.net/vividonly/article/details/6599502
https://zhuanlan.zhihu.com/p/115355303
https://blog.csdn.net/zacklin/article/details/7445442
https://www.cnblogs.com/biyeymyhjob/archive/2012/07/20/2600972.html

linux同步之原子操作（二）