[Front-end source code analysis] AST abstract syntax tree

Reference: Vue Source Code Analysis Series Courses

Series notes:

Outline:

  • Relevant Algorithm Reserve
  • AST Formation Algorithm
  • Handwritten AST compiler
  • Handwritten text parsing function
  • AST optimization
  • Generate h() function from AST

Source code of this chapter: https://gitee.com/szluyu99/vue-source-learn/tree/master/AST_Study

Introduction to Abstract Syntax Trees

What is an Abstract Syntax Tree?

  • It is very difficult to compile directly from template syntax to HTML syntax
  • Transitioning through Abstract Syntax Trees makes this easy

An abstract syntax tree is essentially a JS object:

  • It can be understood as the JS object that represents the HTML syntax, obtained by applying a series of rules

The relationship between abstract syntax tree and virtual node :

Relevant Algorithm Reserve

Related code: https://gitee.com/szluyu99/vue-source-learn/tree/master/AST_Study/pre

pointer

Question: Try to find the character with the most consecutive repetitions in a string.

/**
 * Finds the character with the longest run of consecutive repetitions in `str`.
 *
 * Two indices delimit the current run: `runStart` marks where it begins and
 * `probe` scans forward until it meets a different character. Reading one
 * position past the end yields `undefined`, which differs from every real
 * character and therefore closes the final run. When several runs tie, the
 * earliest one wins.
 *
 * The result is logged (as before) and also returned so callers can use it.
 *
 * @param {string} str - Text to scan.
 * @returns {{ char: (string|undefined), count: number }} The winning character
 *   and its run length; `{ char: undefined, count: 0 }` for an empty string.
 */
function findMaxChar(str) {
    let maxChar;
    let maxRepeat = 0;
    let runStart = 0;

    for (let probe = 1; probe <= str.length; probe++) {
        if (str[probe] !== str[runStart]) {
            const runLength = probe - runStart;
            // Strict ">" keeps the first of several equally long runs.
            if (runLength > maxRepeat) {
                maxRepeat = runLength;
                maxChar = str[runStart];
            }
            runStart = probe;
        }
    }

    console.log(`最多的字母是 ${maxChar},重复了 ${maxRepeat}`);
    return { char: maxChar, count: maxRepeat };
}

// Demo: run findMaxChar on a sample made of several runs of repeated letters.
const str = 'aaaabbbbbcccccccccccccdddddd';
findMaxChar(str);

recursion

Topic 1: Try to output the first 10 items of the Fibonacci sequence, namely 1, 1, 2, 3, 5, 8, 13, 21, 34, 55. Then consider: does the code repeat a lot of calculations? If so, how can the repeated computation be avoided?

Uncached recursion:

/**
 * Plain recursive Fibonacci with no caching.
 *
 * @param {number} n - Index in the sequence.
 * @returns {number} `n` itself when `n <= 1`, otherwise fib(n - 1) + fib(n - 2).
 */
function fib(n) {
    if (n <= 1) {
        return n;
    }
    return fib(n - 1) + fib(n - 2);
}

Recursion with caching:

// Results already computed, keyed by n.
let cache = {};

/**
 * Recursive Fibonacci that stores every computed result in `cache`, so each
 * index is calculated only once.
 *
 * @param {number} n - Index in the sequence.
 * @returns {number} `n` itself when `n <= 1`, otherwise fib(n - 1) + fib(n - 2).
 */
function fib(n) {
    if (!(n in cache)) {
        cache[n] = n <= 1 ? n : fib(n - 1) + fib(n - 2);
    }
    return cache[n];
}

You can add console.count() to count the number of times the function is called

Topic 2: Try to convert the nested (multi-dimensional) array [1, 2, [3, [4, 5], 6], 7, [8], 9] into the object shown below

// Expected result for [1, 2, [3, [4, 5], 6], 7, [8], 9]:
// every number becomes { value }, every nested array becomes { children: [...] }.
{
    
    
  children: [
    {
    
     value: 1 },
    {
    
     value: 2 },
    {
    
    
      children: [
        {
    
     value: 3 },
        {
    
     children: [{
    
     value: 4 }, {
    
     value: 5 }] },
        {
    
     value: 6 }
      ]
    },
    {
    
     value: 7 },
    {
    
     children: [{
    
    value: 8}] },
    {
    
     value: 9 }
  ]
};
// Conversion function, version 1
/**
 * Wraps the converted items of `arr` in a root node.
 *
 * @param {Array} arr - Nested array to convert.
 * @returns {{ children: Array }} Root object whose `children` is whatever `help(arr)` returns.
 */
function convert(arr) {
    const children = help(arr);
    return { children };
}

/**
 * Recursively converts the items of a nested array into node objects.
 *
 * Numbers become `{ value }`, arrays become `{ children: [...] }`;
 * items of any other type are skipped.
 *
 * @param {Array} arr - Array whose items are converted.
 * @returns {Array<Object>} The converted nodes, in input order.
 */
function help(arr) {
  const nodes = [];
  for (const entry of arr) {
    if (typeof entry === "number") {
      nodes.push({ value: entry });
      continue;
    }
    if (Array.isArray(entry)) {
      nodes.push({ children: help(entry) });
    }
  }
  return nodes;
}
// Conversion function, version 2
/**
 * Converts a number or a nested array into a node object, recursing through
 * arrays with `map`.
 *
 * @param {number|Array} item - Value to convert.
 * @returns {Object|undefined} `{ value }` for a number, `{ children }` for an
 *   array, `undefined` for anything else.
 */
function convert(item) {
  if (typeof item === 'number') {
    return { value: item };
  }
  if (Array.isArray(item)) {
    const children = item.map((child) => convert(child));
    return { children };
  }
}

the stack

In JavaScript, a stack can be simulated with an array, using push() and pop()

Topic: Try to write the "smart repeat" smartRepeat function to achieve:

  • 3[abc] becomes abcabcabc
  • 3[2[a]2[b]] becomes aabbaabbaabb
  • 2[1[a]3[b]2[3[c]4[d]]] becomes abbbcccddddcccddddabbbcccddddcccdddd

Don't consider the case where the input string is illegal, such as:

  • 2[a3[b]] is wrong; the missing count must be written as 1, that is 2[1[a]3[b]]
  • [abc] is wrong; the missing count must be written as 1, that is 1[abc]

Generally, when input contains nested brackets and needs lexical analysis, a stack is the usual tool

Idea: traverse each character

  • If the character is a number, push the number onto stack A and an empty string onto stack B
  • If the character is a letter, change the top of stack B to that letter
  • If the character is ], pop both stack A (the count) and stack B (the string), repeat the string that many times, and append the result to the string now at the top of stack B
/**
 * Expands a "smart repeat" template, e.g. `3[abc]` -> `abcabcabc` and
 * `3[2[a]2[b]]` -> `aabbaabbaabb`.
 *
 * Two stacks are kept in step: `counts` holds the repeat count of every group
 * that is still open, `pieces` holds the text collected so far at each nesting
 * level (`pieces[0]` is the top-level result). A closing `]` pops one entry
 * from each, repeats the text and appends it to the enclosing level.
 *
 * Differences from the earlier version: the whole string is consumed (so
 * several groups may follow each other at top level, e.g. `2[a]3[b]`), counts
 * with leading zeros advance the cursor correctly, an empty string yields an
 * empty string, and malformed input throws instead of looping forever.
 *
 * @param {string} templateStr - Template such as `2[1[a]3[b]]`.
 * @returns {string} The expanded string.
 * @throws {Error} If the template contains an unexpected character or
 *   unbalanced brackets.
 */
function smartRepeact(templateStr) {
    const counts = [];
    const pieces = [''];

    let idx = 0;
    while (idx < templateStr.length) {
        const rest = templateStr.substring(idx);

        // A count followed by "[" opens a new group.
        const opener = rest.match(/^(\d+)\[/);
        if (opener) {
            counts.push(Number(opener[1]));
            pieces.push('');
            // Advance by the matched text, not by the numeric value's width ("03[" is 3 chars).
            idx += opener[0].length;
            continue;
        }

        // Word characters directly followed by "]" are the content of the current group.
        const word = rest.match(/^(\w+)\]/);
        if (word) {
            pieces[pieces.length - 1] += word[1];
            idx += word[1].length; // leave the "]" for the next iteration
            continue;
        }

        // "]" closes the innermost group and folds it into its parent.
        if (rest[0] === ']') {
            if (counts.length === 0) {
                throw new Error(`Unexpected "]" at index ${idx}`);
            }
            const times = counts.pop();
            const chunk = pieces.pop();
            pieces[pieces.length - 1] += chunk.repeat(times);
            idx++;
            continue;
        }

        throw new Error(`Unexpected input at index ${idx}: "${rest[0]}"`);
    }

    if (counts.length > 0) {
        throw new Error('Unclosed "[" in template');
    }
    return pieces[0];
}

regular expression

Add a little knowledge of regular expressions commonly used in JS:

// replace() swaps characters in a string, or the substrings matched by a regular expression, for a replacement.
'abc666def123'.replace(/\d/g, ''); // abcdef

// search() looks for a given substring, or for a match of a regular expression.
// It returns -1 when nothing is found.
'abc666def123'.search(/\d/g);  // 3

// match() retrieves a given value, or one or more matches of a regular expression, from a string.
// It returns null when nothing is found.
'abc666def123'.match(/\d/g); // ['6', '6', '6', '1', '2', '3']

// test() checks whether a string matches a pattern.
// The explicit semicolons matter: a line that starts with a regex literal would
// otherwise be parsed as a division continuing the previous statement.
/^\d/.test('5abc'); // true
/^\d/.test('abc'); // false
'555[abc]'.match(/^\d+\[/);
// ['555[', index: 0, input: '555[abc]', groups: undefined]

// () marks a capturing group
'555[abc]'.match(/(^\d+)\[/);
// ['555[', '555', index: 0, input: '555[abc]', groups: undefined]

Handwritten AST abstract syntax tree

Source code of this chapter: https://gitee.com/szluyu99/vue-source-learn/tree/master/AST_Study

Identify start and end tags

For relevant knowledge, refer to the content of "Algorithm Reserve - Stack" above

parse.js

/**
 * First step of the template parser: walks the template and recognises
 * opening tags, closing tags and the text that sits directly before a
 * closing tag. Nothing is built yet; non-blank text is only logged.
 *
 * Known limitation: text that is not immediately followed by a closing tag
 * (for example `<p>123</p> hello`) is skipped character by character.
 *
 * @param {string} templateStr - Template markup made of plain tags without attributes.
 * @returns {undefined} This version produces no result.
 * @throws {Error} When a closing tag does not match the most recently opened tag.
 */
export default function parse(templateStr) {
    // Opening tag, e.g. <div> or <h1>
    const openTagPattern = /^\<([a-z]+[1-6]?)\>/
    // Closing tag, e.g. </div>
    const closeTagPattern = /^\<\/([a-z]+[1-6]?)\>/
    // Text directly in front of a closing tag (the text itself contains no "<")
    const textPattern = /^([^\<]+)\<\/[a-z]+[1-6]?\>/

    const openTags = []      // names of the tags that are currently open
    const childBuckets = []  // one empty array per open tag (not used further in this step)

    for (let cursor = 0; cursor < templateStr.length - 1;) {
        const remaining = templateStr.substring(cursor)

        const opened = remaining.match(openTagPattern)
        if (opened) {
            openTags.push(opened[1])
            childBuckets.push([])
            // The whole match is "<" + name + ">"
            cursor += opened[0].length
            continue
        }

        const closed = remaining.match(closeTagPattern)
        if (closed) {
            const expected = openTags[openTags.length - 1]
            if (closed[1] !== expected) {
                throw new Error(`${expected} 标签没有封闭`)
            }
            openTags.pop()
            // The whole match is "</" + name + ">"
            cursor += closed[0].length
            continue
        }

        const text = remaining.match(textPattern)
        if (text) {
            const content = text[1]
            const isBlank = /^\s+$/.test(content)
            if (!isBlank) {
                console.log(`检测到文字:${content}`)
            }
            // Jump to the end of the text; the closing tag is handled next
            cursor += content.length
            continue
        }

        // Nothing recognised at this position: move one character forward
        cursor += 1
    }
}

Using a stack to form an AST

Achieve the desired effect:

// Sample template, written line by line; joined with "\n" it is the markup to parse.
const htmlStr = [
    '<div>',
    '    <h1>Hello</h1>',
    '    <ul>',
    '        <li>111</li>',
    '        <li>222</li>',
    '        <li>333</li>',
    '    </ul>',
    '</div>',
].join('\n');

// Parse the sample and print the resulting tree.
const ast = parse(htmlStr);
console.log(ast);

parse.js

/**
 * Parses a template made of plain nested tags (no attributes) into an AST.
 *
 * Two stacks are kept in step: `stackA` holds the names of the open tags and
 * `stackB` holds the node under construction for each of them, on top of a
 * synthetic root node. When a closing tag arrives, the finished node is popped
 * and appended to the `children` of the node below it.
 *
 * Element nodes look like `{ tag, children }`, text nodes like
 * `{ text, type: 3 }`. Whitespace-only text is dropped.
 *
 * Known limitation: only text that is immediately followed by a closing tag is
 * recognised; other text is skipped character by character.
 *
 * @param {string} templateStr - Template markup.
 * @returns {Object|undefined} The first top-level element node, or `undefined`
 *   when no element was completed.
 * @throws {Error} When a closing tag does not match the most recently opened
 *   tag; the message names the tag that was left unclosed.
 */
export default function parse(templateStr) {
  // Opening tag, e.g. <div> or <h1>
  const startRegExp = /^\<([a-z]+[1-6]?)\>/;
  // Closing tag, e.g. </div>
  const endRegExp = /^\<\/([a-z]+[1-6]?)\>/;
  // Text directly in front of a closing tag (the text itself contains no "<")
  const wordRegExp = /^([^\<]+)\<\/[a-z]+[1-6]?\>/;

  const stackA = [];
  const stackB = [{ children: [] }];

  let index = 0;
  while (index < templateStr.length - 1) {
    const rest = templateStr.substring(index);

    const startMatch = rest.match(startRegExp);
    if (startMatch) {
      const tag = startMatch[1];
      console.log(`检测到开始标记:<${tag}>`);
      stackA.push(tag);
      stackB.push({ tag: tag, children: [] });
      // The whole match is "<" + name + ">"
      index += startMatch[0].length;
      continue;
    }

    const endMatch = rest.match(endRegExp);
    if (endMatch) {
      const tag = endMatch[1];
      console.log(`检测到结束标记:</${tag}>`);
      const pop_tag = stackA.pop();
      if (tag !== pop_tag) {
        // Report the tag that was actually left open. Reading the stack top
        // here would name its parent, because the pop has already happened.
        throw new Error(`${pop_tag} 标签没有封闭`);
      }
      const pop_arr = stackB.pop();
      if (stackB.length > 0) {
        stackB[stackB.length - 1].children.push(pop_arr);
      }
      // The whole match is "</" + name + ">"
      index += endMatch[0].length;
      continue;
    }

    const wordMatch = rest.match(wordRegExp);
    if (wordMatch) {
      const word = wordMatch[1];
      // Ignore text that consists of whitespace only
      if (!/^\s+$/.test(word)) {
        console.log(`检测到文字:${word}`);
        // Attach the text node to the element currently being built
        stackB[stackB.length - 1].children.push({ text: word, type: 3 });
        console.log(stackB);
      }
      // Jump to the end of the text; the closing tag is handled next
      index += word.length;
      continue;
    }

    // Nothing recognised at this position: move one character forward
    index++;
  }
  console.log(stackB);
  return stackB[0].children[0];
}

Identify Attrs

parseAttrsString.js: parse attrsString into an array of attrs objects

  • String before parsing: class="box red" id="mybox"
  • Parsed array of objects:
[
	{
    
     
	 name: 'class', 
	 value: 'box red'
	},
	{
    
     
	 name: 'id', 
	 value: 'mybox' 
	}
]

The core idea of the parseAttrsString algorithm:

  • Traverse attrsStr; whenever a space is encountered outside quotation marks, add the substring from the previous breakpoint up to the current position to result
/**
 * Parses an attribute string into an array of `{ name, value }` objects,
 * e.g. `class="box red" id="mybox"` ->
 * `[{ name: 'class', value: 'box red' }, { name: 'id', value: 'mybox' }]`.
 *
 * The string cannot simply be split on whitespace because quoted values may
 * contain spaces (`class="aa bb cc" id="gg"`), so it is scanned character by
 * character and only cut at whitespace found outside quotes.
 *
 * Beyond the `name="value"` form this also accepts, instead of throwing:
 * leading/trailing or whitespace-only input, tabs and newlines as separators,
 * empty values (`class=""`), single-quoted and unquoted values, and bare
 * attributes without a value (`disabled`), which get the value `''`.
 *
 * Assumes no whitespace around the `=` sign.
 *
 * @param {string} [attrsStr] - Raw attribute text taken from an opening tag.
 * @returns {Array<{ name: string, value: string }>} Parsed attributes in
 *   source order; an empty array when `attrsStr` is empty or missing.
 */
export default function parseAttrsString(attrsStr) {
    if (!attrsStr) return [];

    const pieces = [];
    let openQuote = null; // the quote character we are currently inside, if any
    let point = 0;        // start of the attribute being collected

    for (let i = 0; i < attrsStr.length; i++) {
        const c = attrsStr[i];
        if (openQuote !== null) {
            // Inside a quoted value only the matching quote is significant
            if (c === openQuote) openQuote = null;
        } else if (c === '"' || c === "'") {
            openQuote = c;
        } else if (/\s/.test(c)) {
            const piece = attrsStr.substring(point, i).trim();
            if (piece !== '') pieces.push(piece);
            point = i;
        }
    }
    // Whatever follows the last cut is the final attribute (if not blank)
    const lastPiece = attrsStr.substring(point).trim();
    if (lastPiece !== '') pieces.push(lastPiece);

    // Turn ["k1=v1", "k2=v2"] into [{ name: k1, value: v1 }, { name: k2, value: v2 }]
    return pieces.map((piece) => {
        const eq = piece.indexOf('=');
        if (eq === -1) {
            return { name: piece, value: '' };
        }
        const name = piece.slice(0, eq);
        let value = piece.slice(eq + 1);
        const first = value[0];
        const isQuoted =
            value.length >= 2 &&
            (first === '"' || first === "'") &&
            value[value.length - 1] === first;
        if (isQuoted) {
            value = value.slice(1, -1);
        }
        return { name, value };
    });
}

parse.js

/**
 * Parses a template of nested tags, optionally carrying attributes, into an AST.
 *
 * Two stacks are kept in step: `stackA` holds the names of the open tags and
 * `stackB` holds the node under construction for each of them, on top of a
 * synthetic root node. When a closing tag arrives, the finished node is popped
 * and appended to the `children` of the node below it.
 *
 * Element nodes look like `{ tag, children, attrs }`, where `attrs` is
 * whatever `parseAttrsString` returns for the raw attribute text (it receives
 * `undefined` when the tag has no attributes). Text nodes look like
 * `{ text, type: 3 }`; whitespace-only text is dropped.
 *
 * The attribute part of an opening tag ends at the first `>` that is not
 * inside double quotes, so text containing `>` (e.g. `<p id="a">x > y</p>`)
 * is no longer swallowed into the attribute string.
 *
 * NOTE(review): `parseAttrsString` must be in scope; its import is not visible
 * in this snippet.
 *
 * Known limitation: only text that is immediately followed by a closing tag is
 * recognised; other text is skipped character by character.
 *
 * @param {string} templateStr - Template markup.
 * @returns {Object|undefined} The first top-level element node, or `undefined`
 *   when no element was completed.
 * @throws {Error} When a closing tag does not match the most recently opened tag.
 */
export default function parse(templateStr) {
  // Opening tag with optional attributes: whitespace, then a run of
  // double-quoted strings or characters other than < > "
  const startRegExp = /^<([a-z]+[1-6]?)(\s(?:[^<>"]|"[^"]*")+)?>/;
  // Closing tag, e.g. </div>
  const endRegExp = /^\<\/([a-z]+[1-6]?)\>/;
  // Text directly in front of a closing tag (the text itself contains no "<")
  const wordRegExp = /^([^\<]+)\<\/[a-z]+[1-6]?\>/;

  const stackA = [];
  const stackB = [{ children: [] }];

  let index = 0;
  while (index < templateStr.length) {
    const rest = templateStr.substring(index); // what is still to be parsed

    const startMatch = rest.match(startRegExp);
    if (startMatch) {
      const tag = startMatch[1];
      const attrsString = startMatch[2]; // undefined when the tag has no attributes
      stackA.push(tag);
      stackB.push({
        tag: tag,
        children: [],
        attrs: parseAttrsString(attrsString),
      });
      // The whole match is "<" + name + attributes + ">"
      index += startMatch[0].length;
      continue;
    }

    const endMatch = rest.match(endRegExp);
    if (endMatch) {
      const tag = endMatch[1];
      const pop_tag = stackA.pop();
      // The closing tag must match the most recently opened one
      if (tag !== pop_tag) {
        throw new Error(`${pop_tag} 标签没有封闭`);
      }
      const pop_arr = stackB.pop();
      if (stackB.length > 0) {
        stackB[stackB.length - 1].children.push(pop_arr);
      }
      // The whole match is "</" + name + ">"
      index += endMatch[0].length;
      continue;
    }

    const wordMatch = rest.match(wordRegExp);
    if (wordMatch) {
      const word = wordMatch[1];
      // Ignore text that consists of whitespace only
      if (!/^\s+$/.test(word)) {
        // Attach the text node to the element currently being built
        stackB[stackB.length - 1].children.push({ text: word, type: 3 });
      }
      // Jump to the end of the text; the closing tag is handled next
      index += word.length;
      continue;
    }

    // Nothing recognised at this position: move one character forward
    index++;
  }
  return stackB[0].children[0];
}

Guess you like

Origin blog.csdn.net/weixin_43734095/article/details/125517711