实现8086汇编编译器（三）——jmp指令的翻译

文章目录

前言

直接看《汇编语言》书中介绍转移指令的内容：

在这里插入图片描述

这篇文章就来讲一下 jmp 指令的翻译。

jmp 汇编指令的格式

jmp 汇编指令的形式一共有如下几种：

jmp short 标号。段内短转移。
jmp near ptr 标号。段内近转移。
jmp far ptr 标号。段间转移，称为远转移。
jmp 16 位寄存器。
jmp word ptr 内存单元地址。段内转移。
jmp dword ptr 内存单元地址。段间转移。
jmp 标号。我实现的时候将它当做段内近转移。

jmp 机器指令的格式

jmp 机器指令格式如下：

在这里插入图片描述

段内近转移。对应汇编“jmp near ptr 标号”和“jmp 标号”两种形式。

段内短转移。对应汇编“jmp short 标号”的形式。

段内间接转移。对应汇编“jmp 16 位寄存器”和“jmp word ptr 内存单元地址”两种形式。

段间直接转移。对应汇编“jmp far ptr 标号”的形式。

段间间接转移。对应汇编“jmp dword ptr 内存单元地址”的形式。

jmp 指令的翻译

jmp 操作数类型

jmp 汇编指令有这么多格式，首先要做的就是区分操作数的类型。

所以先定义 jmp 操作数类型：

// JmpOperandType identifies which syntactic form the operand of a jmp
// statement takes. getJmpOperand produces these values and encodeJmp
// dispatches on them.
type JmpOperandType uint8

// The operand forms recognised by getJmpOperand.
const (
	JmpLabelOperand JmpOperandType = iota // bare label, e.g. "jmp s1"
	JmpShortLabelOperand // "jmp short s1"
	JmpNearLabelOperand // "jmp near ptr s1"
	JmpFarLabelOperand // "jmp far ptr s1"
	JmpRegOperand // 16-bit register, e.g. "jmp ax"
	JmpWordMemOperand // "jmp word ptr <memory operand>"
	JmpDwordMemOperand // "jmp dword ptr <memory operand>"
	JmpInvalidOperand // operand text matched none of the forms above
)

解析操作数

然后就是解析字符串，区分是哪种操作数类型：

// getJmpOperand classifies the operand text of a jmp/call statement and
// returns it as a JmpOperand.
//
// The text is split on whitespace and dispatched on the number of fields:
//
//	1 field:  a 16-bit register (JmpRegOperand, Value is whatever
//	          isReg16Operand returned) or, failing that, a label
//	          (JmpLabelOperand, Value is the label string)
//	2 fields: "short <label>" (JmpShortLabelOperand)
//	3 fields: "near ptr <label>", "far ptr <label>",
//	          "word ptr <mem>" or "dword ptr <mem>"
//
// Every other shape, including a wrong keyword in the "short"/"ptr"
// position or a memory operand rejected by isSimpleMemOperand, yields
// JmpInvalidOperand with a nil Value. Because of the whitespace split, a
// memory operand must be written without internal spaces, e.g.
// "[bx+si+123]".
func getJmpOperand(operand string) JmpOperand {
	fields := strings.Fields(operand)
	invalid := JmpOperand{JmpInvalidOperand, nil}

	switch len(fields) {
	// jmp s1, jmp ax
	case 1:
		if ok, reg := isReg16Operand(fields[0]); ok {
			return JmpOperand{JmpRegOperand, reg}
		}
		return JmpOperand{JmpLabelOperand, fields[0]}

	// jmp short s1
	case 2:
		// Validate the keyword here instead of trusting the caller to have
		// done it, so "jmp foo s1" is never reported as a short jump.
		if fields[0] != "short" {
			return invalid
		}
		return JmpOperand{JmpShortLabelOperand, fields[1]}

	// jmp near ptr s1, jmp far ptr s1,
	// jmp word ptr [bx+si+123], jmp dword ptr [bx+si+123]
	case 3:
		if fields[1] != "ptr" {
			return invalid
		}
		switch fields[0] {
		case "near":
			return JmpOperand{JmpNearLabelOperand, fields[2]}
		case "far":
			return JmpOperand{JmpFarLabelOperand, fields[2]}
		case "word":
			if ok, mem := isSimpleMemOperand(fields[2]); ok {
				return JmpOperand{JmpWordMemOperand, mem}
			}
		case "dword":
			if ok, mem := isSimpleMemOperand(fields[2]); ok {
				return JmpOperand{JmpDwordMemOperand, mem}
			}
		}
	}

	return invalid
}

checkJmp 的实现

// checkJmp validates a jmp/call statement and prepares the context that
// encodeJmp consumes.
//
// stmt[0] is the mnemonic ("jmp" or "call") and stmt[1] is the operand
// text. On any syntax error the process is terminated via log.Fatalf. On
// success the function returns true together with a context carrying two
// values:
//
//	encodeCtxKey("dst")  - the JmpOperand parsed from stmt[1]
//	encodeCtxKey("type") - the uint8 instruction type (InstructionJmp or
//	                       InstructionCall)
func checkJmp(stmt []string) (bool, context.Context) {
	// Guard the indexing below; a truncated statement would otherwise
	// panic with an index-out-of-range error.
	if len(stmt) < 2 {
		log.Fatalf("invalid jmp/call statement %q", stmt)
	}
	mnemonic, operandText := stmt[0], stmt[1]

	fields := strings.Fields(operandText)
	switch n := len(fields); {
	case n > 3:
		log.Fatalf("0 invalid \"%s\" syntax", mnemonic)
	case n == 2:
		// Only "short <label>" is a legal two-word operand.
		if fields[0] != "short" {
			log.Fatalf("1 invalid \"%s\" syntax", mnemonic)
		}
	case n == 3:
		// Three-word operands are "<near|far|word|dword> ptr <target>".
		if fields[1] != "ptr" {
			log.Fatalf("2 invalid \"%s\" syntax", mnemonic)
		}
		switch fields[0] {
		case "near", "far", "word", "dword":
		default:
			log.Fatalf("2 invalid \"%s\" syntax", mnemonic)
		}
	}

	operand := getJmpOperand(operandText)
	if operand.Type == JmpInvalidOperand {
		log.Fatalf("3 invalid \"%s\" syntax", mnemonic)
	}

	// Map the mnemonic to its instruction type. An unknown mnemonic is
	// rejected explicitly rather than silently encoded as type 0.
	var instructionType uint8
	switch mnemonic {
	case "jmp":
		instructionType = InstructionJmp
	case "call":
		instructionType = InstructionCall
	default:
		log.Fatalf("unsupported instruction \"%s\"", mnemonic)
	}

	ctx := context.Background()
	ctx = context.WithValue(ctx, encodeCtxKey("dst"), operand)
	ctx = context.WithValue(ctx, encodeCtxKey("type"), instructionType)
	return true, ctx
}

因为 call 指令和 jmp 指令的格式类似，这里我把它们放在一起实现，并在 ctx 里面加了个 "type" key，用来区分当前翻译的是 jmp 指令还是 call 指令。

call 指令的格式如下：

在这里插入图片描述

encodeJmp 的实现

// encodeJmp translates a validated jmp/call statement into machine code.
//
// ctx must carry the two values stored by checkJmp: encodeCtxKey("dst")
// holding a JmpOperand and encodeCtxKey("type") holding the uint8
// instruction type. A missing or mistyped value makes the type assertions
// below panic.
//
// The operand type selects the encoder. Far-pointer labels are not
// supported yet, and that case terminates the process via log.Fatal, as
// does an unrecognised operand type.
func encodeJmp(ctx context.Context) []byte {
	operand := ctx.Value(encodeCtxKey("dst")).(JmpOperand)
	instructionType := ctx.Value(encodeCtxKey("type")).(uint8)

	var instruction []byte
	switch operand.Type {
	// jmp short s1
	case JmpShortLabelOperand:
		label := operand.Value.(string)
		instruction = encodeJmpDirectWithinsegmentShort(label)

	// jmp s1, jmp near ptr s1, call s
	case JmpLabelOperand, JmpNearLabelOperand:
		label := operand.Value.(string)
		instruction = encodeJmpDirectWithinsegment(instructionType, label)

	// jmp far ptr s1, call far ptr s
	case JmpFarLabelOperand:
		// Direct intersegment transfers are not implemented. log.Fatal
		// exits the process, so no encoding is attempted here; call
		// encodeJmpDirectIntersegment from this case once it is supported.
		log.Fatal("jmp not support far ptr")

	// jmp ax
	case JmpRegOperand:
		reg := operand.Value.(*RegOperand)
		instruction = encodeJmpRegIndirectWithinsegment(instructionType, reg)

	// jmp word ptr [..]
	case JmpWordMemOperand:
		mem := operand.Value.(*MemOperand)
		instruction = encodeJmpIndirectWithinsegment(instructionType, mem)

	// jmp dword ptr [..]
	case JmpDwordMemOperand:
		mem := operand.Value.(*MemOperand)
		instruction = encodeJmpIndirectIntersegment(instructionType, mem)

	default:
		log.Fatal("error encodeing jmp")
	}
	return instruction
}

看下段内近转移的编码函数 encodeJmpDirectWithinsegment 的实现：

// encodeJmpDirectWithinsegment encodes a direct within-segment transfer:
// "jmp near ptr s1", "jmp s1" or "call s".
//
// Layout: opcode, IP-INC-LO, IP-INC-HI. The opcode byte is 11101001 for
// InstructionJmp and 11101000 for InstructionCall; for any other
// instructionType no opcode byte is emitted. The two displacement bytes
// are written as zero, and the label reference is registered through
// putJmpLabelEncodeInfo (field at byte 1, 16 bits wide, instruction length
// 3).
func encodeJmpDirectWithinsegment(instructionType uint8, label string) []byte {
	const (
		jmpNearOpcode  = 0b11101001
		callNearOpcode = 0b11101000
	)

	encoded := make([]byte, 0, 3)
	switch instructionType {
	case InstructionJmp:
		encoded = append(encoded, jmpNearOpcode)
	case InstructionCall:
		encoded = append(encoded, callNearOpcode)
	}

	// Zero placeholder bytes for IP-INC-LO and IP-INC-HI.
	encoded = append(encoded, 0, 0)

	putJmpLabelEncodeInfo(label, 1, 16, 3)
	return encoded
}

注意，jmp 的偏移量在翻译成机器指令的时候先填为 0，只起占位作用。

然后调用 putJmpLabelEncodeInfo(label, 1, 16, 3)，把标号的信息记录下来。

putJmpLabelEncodeInfo 的实现如下：

func putJmpLabelEncodeInfo(name string, offsetInInstruction uint8, width uint8, instructionLen uint8) {
    
    
	labelEncodeInfos = append(labelEncodeInfos,
		LabelEncodeInfo{
    
    
			Name:       name,
			Offset:     progOffset + uint32(offsetInInstruction),
			Width:      width,
			IsJmpLable: true,
			JmpInc:     codeOffset + uint16(instructionLen),
		})
}

再看下标号信息结构的定义：

// labelEncodeInfos collects the label records appended by
// putJmpLabelEncodeInfo while instructions are being encoded.
var labelEncodeInfos []LabelEncodeInfo

// LabelEncodeInfo describes one label reference recorded while encoding an
// instruction.
type LabelEncodeInfo struct {
    
    
	Name          string // label name
	Offset        uint32 // offset of the label within the program
	Width         uint8  // width of the label value
	IsOffsetLabel bool   // whether this is an "offset" label
	IsJmpLable    bool   // whether the label appears in a jmp instruction (field name spelling kept as-is)
	JmpInc        uint16 // offset within the code segment of the instruction following the jmp
}

来理解下 putJmpLabelEncodeInfo(label, 1, 16, 3) 四个参数。

第一个参数是标号名称。

第二个参数是 jmp 偏移量在机器指令中的偏移量。

在这里插入图片描述
从格式里可以看到，表示 jmp 偏移量的 IP-INC-LO、IP-INC-HI 字段是从指令的第 2 个字节开始的（下标从 0 算起就是 1），所以 offsetInInstruction 参数是 1。putJmpLabelEncodeInfo 在添加标号结构时，把这个参数值加上 progOffset 变量的值，就得到了这个标号在程序中的偏移量！【后续文章介绍它的作用】

第三个参数是 jmp 偏移量的宽度 16 位。

最后一个参数是编码后机器指令的长度，也就是 3 字节。putJmpLabelEncodeInfo 在添加标号结构时，把这个参数值加上 codeOffset 变量的值，就得到了这条指令的下一条指令相对代码段的偏移量！【后续文章介绍它的作用】

搞定了 mov 和 jmp 指令的翻译，其他指令都是类似的，想要加上什么指令按照这种思路添加就行了。