Alternative macro in the kernel

Through the alternative() macro, the kernel can optimize its own code online: at runtime it checks whether the current CPU supports certain features and, if so, rewrites selected kernel instructions in place (without shutting down or replacing the kernel) to speed up kernel execution. The following takes x86 as an example to describe the approximate implementation process.
1) Alternative() macro definition

The ALTERNATIVE() macro is defined in arch/x86/include/asm/alternative-asm.h, as shown below:

/*
 * ALTERNATIVE oldinstr, newinstr, feature
 *
 * Emit \oldinstr in place, pad it with NOPs up to the length of
 * \newinstr when the replacement is longer, and record a patch-site
 * descriptor so apply_alternatives() can overwrite the old bytes with
 * \newinstr at boot if the CPU has \feature.
 */
.macro ALTERNATIVE oldinstr, newinstr, feature
140:
    \oldinstr
141:
    /*
     * Build-time NOP (0x90) padding.  (144f-143f) is the replacement
     * length, (141b-140b) the original length.  In gas a true relational
     * expression evaluates to -1, so -((new-old) > 0) is 1 when the
     * replacement is longer and 0 otherwise; the .skip therefore emits
     * (new-old) NOP bytes only in the "replacement is longer" case.
     */
    .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
142:
    
    /* Task (a): record an alt_instr entry in the .altinstructions section. */
    .pushsection .altinstructions,"a"
    altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
    .popsection
    
    /* Task (b): stash the replacement bytes out-of-line in .altinstr_replacement. */
    .pushsection .altinstr_replacement,"ax"
143:
    \newinstr
144:
    .popsection
.endm

Since instructions are rewritten according to a CPU feature, the macro takes three arguments: oldinstr, newinstr, and feature.
The ALTERNATIVE() macro mainly does the following two tasks:
a. Put an instance of struct alt_instr into the .altinstructions section of the vmlinux ELF file; this structure stores the information the instruction rewrite depends on, as shown below:

/*
 * One patch-site descriptor, emitted into .altinstructions by the
 * ALTERNATIVE() macro.  The two s32 offsets are relative to the address
 * of the field itself (apply_alternatives() computes e.g.
 * (u8 *)&a->instr_offset + a->instr_offset), which keeps the entries
 * position-independent.
 */
struct alt_instr {
    s32 instr_offset;   /* original instruction, relative to this field */
    s32 repl_offset;    /* offset to replacement instruction */
    u16 cpuid;      /* cpuid bit set for replacement */
    u8  instrlen;       /* length of original instruction */
    u8  replacementlen; /* length of new instruction */
    u8  padlen;     /* length of build-time padding */
} __packed;

b. Put the above newinstr into the .altinstr_replacement section

2) alternative() macro call

Define alternative_input() macro

/*
 * alternative_input() - ALTERNATIVE with extra asm input operands.
 *
 * The dummy "i" (0) operand exists so that ", ## input" is well-formed:
 * with a named variadic macro argument, ## deletes the leading comma
 * when the caller passes no extra operands.
 */
#define alternative_input(oldinstr, newinstr, feature, input...)    \
    asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature)   \
        : : "i" (0), ## input)

Call alternative_input() in the ./arch/x86/include/asm/processor.h file

alternative_input(BASE_PREFETCH, "prefetchnta %P1", X86_FEATURE_XMM, "m" (*(const char *)x));

3) Online rewriting of instructions
The apply_alternatives() function, defined in the arch/x86/kernel/alternative.c file, implements the online rewriting of instructions. It is called as:

apply_alternatives(__alt_instructions, __alt_instructions_end);

Among them, __alt_instructions and __alt_instructions_end are the start and end positions of the .altinstructions section respectively, and their reference method is:

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];

They are defined in arch/x86/kernel/vmlinux.lds.S, as shown below:

/*  
 * struct alt_inst entries. From the header (alternative.h):
 * "Alternative instructions for different CPU types or capabilities"
 * Think locking instructions on spinlocks.
 */
 /* Export the section bounds as __alt_instructions[_end] for the patcher. */
 . = ALIGN(8);
 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
     __alt_instructions = .;
     *(.altinstructions)
     __alt_instructions_end = .;
  }

The definition of apply_alternatives() is as follows. The general process is to traverse all struct alt_instr instances in the .altinstructions section and process them separately:

/*
 * Patch the kernel text at boot (or module load): for every alt_instr
 * entry in [start, end), overwrite the original instruction bytes with
 * the feature-specific replacement when the boot CPU has that feature.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
                          struct alt_instr *end)
{
    struct alt_instr *a;
    u8 *instr, *replacement;
    u8 insn_buff[MAX_PATCH_LEN];    /* staging buffer for the patched bytes */
 
    DPRINTK("alt table %px, -> %px", start, end);
    /*   
     * The scan order should be from start to end. A later scanned
     * alternative code can overwrite previously scanned alternative code.
     * Some kernel functions (e.g. memcpy, memset, etc) use this order to
     * patch code.
     *
     * So be careful if you want to change the scan order to any other
     * order.
     */
    for (a = start; a < end; a++) {
        int insn_buff_sz = 0; 
 
        /*
         * The offsets are self-relative: add each offset to the address
         * of its own field to recover the absolute addresses of the
         * patch site and of the replacement bytes.
         */
        instr = (u8 *)&a->instr_offset + a->instr_offset;
        replacement = (u8 *)&a->repl_offset + a->repl_offset;
        BUG_ON(a->instrlen > sizeof(insn_buff));
        BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); 
        /* CPU lacks the feature: keep oldinstr; just tidy any NOP padding. */
        if (!boot_cpu_has(a->cpuid)) {
            if (a->padlen > 1) 
                optimize_nops(a, instr);
 
            continue;
        }    
 
        DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
            a->cpuid >> 5,
            a->cpuid & 0x1f,
            instr, instr, a->instrlen,
            replacement, a->replacementlen, a->padlen);
 
        DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
        DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
 
        /* Stage the replacement in a local buffer before poking it in. */
        memcpy(insn_buff, replacement, a->replacementlen);
        insn_buff_sz = a->replacementlen;
        
        /*   
         * 0xe8 is a relative CALL; its rel32 was assembled relative to
         * the replacement's own address in .altinstr_replacement, so
         * re-bias it by (replacement - instr) to stay correct once the
         * bytes run at the patch site.
         *   
         * The length is checked before the opcode byte to avoid
         * accessing uninitialized bytes for zero-length replacements.
         */  
        if (a->replacementlen == 5 && *insn_buff == 0xe8) {
            *(s32 *)(insn_buff + 1) += replacement - instr;
            DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
                *(s32 *)(insn_buff + 1),
                (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
        }    
             
        /* Relative JMPs need the same kind of displacement fixup. */
        if (a->replacementlen && is_jmp(replacement[0]))
            recompute_jump(a, instr, replacement, insn_buff);
             
        /* Replacement shorter than the original: fill the rest with NOPs. */
        if (a->instrlen > a->replacementlen) {
            add_nops(insn_buff + a->replacementlen,
                 a->instrlen - a->replacementlen);
            insn_buff_sz += a->instrlen - a->replacementlen;
        }    
        DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
             
        /* Write the staged bytes over the live kernel text. */
        text_poke_early(instr, insn_buff, insn_buff_sz);
    }        
}            

 

You may also like

Source: blog.csdn.net/choumin/article/details/115108813