实现8086汇编编译器（四）——生成可执行程序

文章目录

添加标号信息

还是将两个示例程序再贴出来。

示例程序1：

; Example 1: set ss:sp to the stack segment and ds to the data segment, then
; push eight words addressed through cs:[bx] and pop eight words back to
; cs:[bx], starting again from bx = 0.
assume cs:code,ds:data,ss:stack     ; associate cs, ds, ss with the code, data, stack segments
data segment
  dw 0123h, 0456h, 0789h, 0abch, 0defh, 0fedh, 0cbah, 0987h
data ends

stack segment
  dw 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
stack ends
code segment
  start: mov ax,stack
         mov ss,ax
         mov sp,20h         ; set the stack top: ss:sp points to stack:20h

         mov ax, data        ; load the segment address of the segment named "data" into ax
         mov ds,ax          ; ds now points to the data segment

         mov bx,0           ; ds:bx points to the first unit of the data segment
         mov cx,8

    s0: push cs:[bx]
        add bx,2
        loop s0             ; the loop above pushes the 8 words at code-segment offsets 0~15 in order

        mov bx,0
        mov cx, 8
    s1:pop cs:[bx]
        add bx,2
        loop s1             ; the loop above pops 8 words in order into code-segment offsets 0~15

  mov ax,4c00h
  int 21h
code ends
end start

示例程序2：

; Example 2: the entry point is the label start (see "end start"), which is
; placed after the first two instructions of the code segment. The program
; copies one word from label s2 over label s and then jumps back to s.
assume cs:code
code segment
      mov ax,4c00h
      int 21h
  start: mov ax,0
      s: nop
         nop
         
         mov di,offset s      ; di = offset of label s
         mov si,offset s2     ; si = offset of label s2
         mov ax,cs:[si]       ; read one word from cs:[si] (at label s2)
         mov cs:[di],ax       ; write that word to cs:[di] (at label s)

      s0: jmp short s         ; backward reference to label s

      s1: mov ax,0
          int 21h
          mov ax,0

      s2: jmp short s1        ; backward reference to label s1
          nop
code ends
end start

当实现了 mov、add、jmp、push、pop、loop、int、nop 这几个指令的翻译之后，就可以将上面两个示例程序翻译成由机器指令组成的二进制程序了。

但是程序中的标号目前还没有处理。在前两篇文章介绍的翻译过程中，标号对应的机器码格式中的值都为0，我们只是记录了标号在程序中的信息。

比如第一个程序中：

  start: mov ax,stack
         mov ss,ax
         
         mov ax, data
         mov ds,ax

stack 标号的值是多少？ data 标号的值是多少？

比如第二个程序中：

         mov di,offset s
         mov si,offset s2

offset s 和 offset s2 的值是多少？

      s0: jmp short s
      s2: jmp short s1

两条 jmp 指令中到标号 s 和 s1 的偏移量是多少？

所以我们必须对程序中这些机器指令中的值进行修正。

具体来说，以第一个程序片段：

  start: mov ax,stack
         mov ss,ax
         
         mov ax, data
         mov ds,ax

中的 ‘mov ax, stack’ 指令为例，在翻译时程序识别出这是“mov 立即数到寄存器”的形式，所以会调用编码函数 encodeMovImmediateToReg 来处理：

// encodeMovImmediateToReg encodes "mov reg, imm" with the layout
//
//	1011 w reg | data [data]
//
// The destination register decides the operand width: a Width of 8 selects
// w=0 and a register code from reg8BitMap; any other width selects w=1 and a
// register code from reg16BitMap. When the immediate is a label or an
// "offset" label, the position of the immediate inside the instruction is
// recorded through putLabelEncodeInfo. The low byte of src.Value is always
// emitted; the high byte follows only when dst.Width is 16.
func encodeMovImmediateToReg(src *ImmediateOperand, dst *RegOperand) []byte {
	var wBit, regCode uint8
	switch dst.Width {
	case 8:
		regCode = reg8BitMap[dst.Name]
	default:
		wBit = 1
		regCode = reg16BitMap[dst.Name]
	}

	// Opcode byte: 1011 w reg.
	encoded := []byte{0b10110000 | wBit<<3 | regCode}

	// The immediate starts right after the bytes emitted so far; remember
	// that position for label operands.
	if src.IsLabel || src.IsLabelOffset {
		putLabelEncodeInfo(src.Label, uint8(len(encoded)), dst.Width, src.IsLabelOffset)
	}

	// Immediate, low byte first.
	encoded = append(encoded, byte(src.Value))
	if dst.Width == 16 {
		encoded = append(encoded, byte((src.Value>>8)&0xff))
	}
	return encoded
}

由于立即数‘stack’是个标号，所以调用 putLabelEncodeInfo 将标号信息记录：

func putLabelEncodeInfo(name string, offsetInInstruction uint8, width uint8, isOffsetLabel bool) {
    
    
	labelEncodeInfos = append(labelEncodeInfos,
		LabelEncodeInfo{
    
    
			Name:          name,
			Offset:        progOffset + uint32(offsetInInstruction),
			Width:         width,
			IsOffsetLabel: isOffsetLabel,
		})
}

这样我们就知道了这个标号在程序中的偏移量和它的宽度。

修正程序中的标号值

标号的信息在翻译过程中记录在 labelEncodeInfos 表中：

// LabelEncodeInfo describes one place in the emitted program where the value
// of a label still has to be filled in.
type LabelEncodeInfo struct {
	Name          string // label name
	Offset        uint32 // offset within the program of the bytes to patch
	Width         uint8  // width of the label value
	IsOffsetLabel bool   // true for an "offset <label>" operand
	IsJmpLable    bool   // true when the label is the target of a jmp instruction
	JmpInc        uint16 // code-segment offset of the instruction following the jmp
}

// labelEncodeInfos is the table of label references collected during
// translation.
var labelEncodeInfos []LabelEncodeInfo

所以在扫描到 “end start” 【标识程序结束】后，调用 processLabelEncodeInfo 函数对程序中标号的值进行修正：

// processLabelEncodeInfo resolves every label reference recorded in
// labelEncodeInfos once the whole source has been translated.
//
// For each entry:
//   - offset label: the label's value is looked up in labelMap and written
//     into program at info.Offset (low byte first; the high byte too when
//     Width is 16);
//   - jmp label: the label's value minus info.JmpInc is written as a signed
//     displacement at info.Offset. An 8-bit displacement that does not fit
//     in [-128, 127] is a fatal error instead of being silently truncated;
//   - anything else is treated as a segment label: it is looked up in segMap
//     and a 16-bit RelocationInfo is appended to relocationInfos.
//
// A label missing from its table, or a segment label bound to a register
// other than cs/ds/ss, terminates the process through log.Fatalf.
func processLabelEncodeInfo() {
	fmt.Println("len(labelEncodeInfos)=", len(labelEncodeInfos))
	for _, info := range labelEncodeInfos {
		if !info.IsOffsetLabel && !info.IsJmpLable {
			// Segment label: look it up in the segment table.
			seg, ok := segMap[info.Name]
			if !ok {
				log.Fatalf("cant't find seg label %s in seg table", info.Name)
			}
			fmt.Printf("find seg label \"%s\", offset %d\n", info.Name, info.Offset)

			var relocType RelocationType
			switch seg {
			case "cs":
				relocType = CodeSegementRelocation
			case "ds":
				relocType = DataSegementRelocation
			case "ss":
				relocType = StackSegementRelocation
			default:
				// Without this case relocType would stay at its zero value
				// and the entry would silently become a code-segment
				// relocation.
				log.Fatalf("seg label %s is bound to unsupported segment register %v", info.Name, seg)
			}

			// Record the relocation entry for this position.
			relocationInfos = append(relocationInfos, RelocationInfo{info.Offset, 16, relocType})
			continue
		}

		// Offset and jmp labels are both resolved through the label table.
		v, ok := labelMap[info.Name]
		if !ok {
			log.Fatalf("cant't find inner label %s in label table", info.Name)
		}

		if info.IsOffsetLabel {
			// Offset label: store the label's value itself.
			offset := v
			fmt.Printf("offset label: \"%s\" offset %d, value %d\n", info.Name, info.Offset, offset)
			program[info.Offset] = byte(offset & 0xff)
			if info.Width == 16 {
				program[info.Offset+1] = byte((offset >> 8) & 0xff)
			}
			continue
		}

		// Jmp label: store the displacement relative to info.JmpInc. The
		// displacement may be negative, hence the signed type.
		jmpInc := int16(v - info.JmpInc)
		if info.Width == 8 && (jmpInc < -128 || jmpInc > 127) {
			log.Fatalf("jmp label %s is out of range for an 8-bit displacement: %d", info.Name, jmpInc)
		}
		fmt.Printf("jmp label: \"%s\" offset %d, value %d\n", info.Name, info.Offset, jmpInc)
		program[info.Offset] = byte(jmpInc)
		if info.Width == 16 {
			program[info.Offset+1] = byte((jmpInc >> 8) & 0xff)
		}
	}
}

对于 offset 标号，直接查找 labelMap 获取它的值【代码段中的偏移量】，然后写入程序中。

对于 jmp 标号，先查找 labelMap 获取它的值【代码段中的偏移量】，然后减去 jmp 机器指令下一条指令在代码段中的偏移量就得到实际 jmp 机器指令中偏移量的值：

var jmpInc int16 = int16(v - info.JmpInc)

偏移量有可能是负数，所以要表示成 int16。

重定位信息

对段标号（如 data、stack）的处理比较特殊，因为它的值代表程序实际执行时代码段、数据段或堆栈段的段地址，而在编译时是不能确定它的值的，只有程序被加载时才能确定！

所以我们需要将这些标号在程序中的偏移量信息记录下来，告诉加载程序【后面要实现的8086虚拟机】。这样加载程序才能在加载时将它们修改成运行时的值。我们将这些信息称作重定位信息。因此需要给程序添加一个程序头来存放这些重定位信息。

重定位信息结构体定义如下：

// RelocationType identifies which segment a relocation entry refers to.
type RelocationType uint8

// Relocation kinds, numbered consecutively from zero.
const (
	CodeSegementRelocation  RelocationType = 0
	DataSegementRelocation  RelocationType = 1
	StackSegementRelocation RelocationType = 2
)

// RelocationInfo is one relocation entry: it names a position in the program
// whose value must be replaced when the program is loaded.
type RelocationInfo struct {
	Offset uint32         // offset within the program of the value to patch
	Width  uint8          // width of the value
	Type   RelocationType // which segment's value (code, data or stack) to patch in
}

有了重定位信息之后，我们要将这些信息放入程序中，一般都放在程序头部，因此我们定义程序头：

添加程序头

程序头结构体定义如下：


// programHeaderLen is the maximum length of the program header.
const programHeaderLen = 256

// ProgramHeader is the header placed in front of the program. It carries the
// offsets of the segments and of the entry point, plus the relocation
// entries.
type ProgramHeader struct {
	codeSegProgOffset   uint32           // offset of the code segment within the program
	codeEntryProgOffset uint32           // offset of the program entry point
	dataSegProgOffset   uint32           // offset of the data segment within the program
	stackSegProgOffset  uint32           // offset of the stack segment within the program
	relocationLen       uint32           // number of relocation entries, at most 39
	relocationInfos     []RelocationInfo // the relocation entries
}

由于我们人为规定程序头的最大长度为 256 字节，其中前 5 个固定字段共占 5×4=20 字节，而一条重定位信息的长度为 6 字节（4 字节偏移量 + 1 字节宽度 + 1 字节类型），所以最多只能有 (256-20)/6 向下取整 = 39 条重定位信息。

程序头中除了包含重定位信息外，还包含代码段、数据段、堆栈段在程序中的偏移量和程序入口偏移量。

在加载程序加载这个程序时【已经确定了 cs 寄存器的值】，根据这些偏移量就可以计算出 ds，ss 寄存器的值，根据重定位信息将程序做相应的修改，然后将 ip 寄存器设置为程序入口偏移量，整个程序就可以运行起来了。

生成程序头部的代码如下：

		// Build the program header: five little-endian uint32 fields followed
		// by the relocation entries, zero-padded to programHeaderLen bytes.
		ph := ProgramHeader{
			codeSegProgOffset,
			codeEntryOffset,
			dataSegProgOffset,
			stackSegProgOffset,
			uint32(len(relocationInfos)),
			relocationInfos,
		}

		header := new(bytes.Buffer)
		headerFields := []interface{}{
			ph.codeSegProgOffset,
			ph.codeEntryProgOffset,
			ph.dataSegProgOffset,
			ph.stackSegProgOffset,
			ph.relocationLen,
			ph.relocationInfos,
		}
		for _, field := range headerFields {
			if err := binary.Write(header, binary.LittleEndian, field); err != nil {
				log.Fatal("encode program header failed:", err)
			}
		}

		// Too many relocation entries would make the padding length negative
		// and panic in make; report the overflow explicitly instead.
		if header.Len() > programHeaderLen {
			log.Fatalf("program header needs %d bytes but at most %d are allowed (%d relocation entries)",
				header.Len(), programHeaderLen, len(relocationInfos))
		}
		header.Write(make([]byte, programHeaderLen-header.Len()))

生成可执行程序

最后将我们生成的可执行程序写入文件中：

		// Write the executable: the program header first, then the program.
		f, err := os.Create("./a.exec")
		if err != nil {
			log.Fatal("create file failed:", err)
		}

		defer f.Close()

		// Write the program header.
		if _, err := f.Write(header.Bytes()); err != nil {
			log.Fatal("write program header failed:", err)
		}
		// Write the program itself.
		if _, err := f.Write(program); err != nil {
			log.Fatal("write program failed:", err)
		}
		// Flush to disk so that a failed write is reported here rather than
		// leaving a truncated executable behind unnoticed.
		if err := f.Sync(); err != nil {
			log.Fatal("sync file failed:", err)
		}

大功告成。
以第一个程序为例：

; Example 1 (repeated): set ss:sp to the stack segment and ds to the data
; segment, then push eight words addressed through cs:[bx] and pop eight words
; back to cs:[bx], starting again from bx = 0.
assume cs:code,ds:data,ss:stack     ; associate cs, ds, ss with the code, data, stack segments
data segment
  dw 0123h, 0456h, 0789h, 0abch, 0defh, 0fedh, 0cbah, 0987h
data ends

stack segment
  dw 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
stack ends
code segment
  start: mov ax,stack
         mov ss,ax
         mov sp,20h         ; set the stack top: ss:sp points to stack:20h

         mov ax, data        ; load the segment address of the segment named "data" into ax
         mov ds,ax          ; ds now points to the data segment

         mov bx,0           ; ds:bx points to the first unit of the data segment
         mov cx,8

    s0: push cs:[bx]
        add bx,2
        loop s0             ; the loop above pushes the 8 words at code-segment offsets 0~15 in order

        mov bx,0
        mov cx, 8
    s1:pop cs:[bx]
        add bx,2
        loop s1             ; the loop above pops 8 words in order into code-segment offsets 0~15

  mov ax,4c00h
  int 21h
code ends
end start

看下生成的程序格式：

root@ubuntu:~/gogo/demo# hexdump a.exec 
0000000 0030 0000 0000 0000 0000 0000 0010 0000
0000010 0002 0000 0031 0000 0210 0039 0000 0110
0000020 0000 0000 0000 0000 0000 0000 0000 0000
*
0000100 0123 0456 0789 0abc 0def 0fed 0cba 0987
0000110 0000 0000 0000 0000 0000 0000 0000 0000
*
0000130 00b8 8e00 bcd0 0020 00b8 8e00 bbd8 0000
0000140 08b9 2e00 37ff c381 0002 f7e2 00bb b900
0000150 0008 8f2e 8107 02c3 e200 b8f7 4c00 21cd
0000160

后面文章介绍实现 8086 虚拟机运行可执行程序。