AST abstract syntax tree
Reference: Vue Source Code Analysis Series Courses
Series notes:
- [Front-end source code analysis] The core principle of mustache template engine
- [Front-end source code analysis] The core principle of virtual DOM
- [Front-end source code analysis] Data responsive principle
- [Front-end source code analysis] AST abstract syntax tree
- [Front-end source code analysis] Instructions and life cycle
Outline:
- Relevant Algorithm Reserve
- AST Formation Algorithm
- Handwritten AST compiler
- Handwritten text parsing function
- AST optimization
- Generate h() function from AST
Source code of this chapter: https://gitee.com/szluyu99/vue-source-learn/tree/master/AST_Study
Introduction to Abstract Syntax Trees
What is an Abstract Syntax Tree?
- Compiling template syntax directly into HTML syntax is very difficult
- Going through an abstract syntax tree as an intermediate step makes this much easier
An abstract syntax tree is essentially a JS object:
- It is the JS object that a set of parsing rules produces from the HTML-like template syntax
The relationship between abstract syntax tree and virtual node :
Relevant Algorithm Reserve
Related code: https://gitee.com/szluyu99/vue-source-learn/tree/master/AST_Study/pre
pointer
Question: Try to find the character with the most consecutive repetitions in a string.
/**
 * Find the character with the longest run of consecutive repetitions in
 * `str` and print it, together with the run length, to the console.
 * When several runs share the maximum length, the first one is reported.
 *
 * @param {string} str - The string to scan.
 */
function findMaxChar(str) {
  let bestChar = str[0];
  let bestLength = -1;
  let runStart = 0;
  // `runEnd` deliberately walks one position past the end of the string:
  // comparing against `undefined` there closes the final run.
  for (let runEnd = 1; runStart < str.length; runEnd++) {
    if (str[runStart] === str[runEnd]) {
      continue;
    }
    const runLength = runEnd - runStart;
    if (runLength > bestLength) {
      bestLength = runLength;
      bestChar = str[runStart];
    }
    runStart = runEnd;
  }
  console.log(`最多的字母是 ${bestChar},重复了 ${bestLength} 次`);
}

let str = 'aaaabbbbbcccccccccccccdddddd';
findMaxChar(str);
recursion
Topic 1: Output the first 10 items of the Fibonacci sequence, namely 1, 1, 2, 3, 5, 8, 13, 21, 34, 55. Then consider: does the code perform a lot of repeated calculations? How can the repeated calculation be avoided?
Uncached recursion:
/**
 * Plain recursive Fibonacci: fib(0) = 0, fib(1) = 1, fib(n) = fib(n - 1) + fib(n - 2).
 * Any n <= 1 is returned unchanged. Sub-results are recomputed on every call.
 *
 * @param {number} n - Index into the sequence.
 * @returns {number} The n-th Fibonacci number.
 */
function fib(n) {
  if (n <= 1) {
    return n;
  }
  return fib(n - 1) + fib(n - 2);
}
Recursion with caching:
// Results already computed, keyed by n.
let cache = {};

/**
 * Memoised Fibonacci: each fib(n) is computed once, stored in `cache`,
 * and served from there on every later call.
 *
 * @param {number} n - Index into the sequence.
 * @returns {number} The n-th Fibonacci number.
 */
function fib(n) {
  if (!(n in cache)) {
    cache[n] = n <= 1 ? n : fib(n - 1) + fib(n - 2);
  }
  return cache[n];
}
You can add `console.count()` inside the function to count the number of times it is called
Topic 2: Convert the nested array `[1, 2, [3, [4, 5], 6], 7, [8], 9]` into the object shown below
{
children: [
{
value: 1 },
{
value: 2 },
{
children: [
{
value: 3 },
{
children: [{
value: 4 }, {
value: 5 }] },
{
value: 6 }
]
},
{
value: 7 },
{
children: [{
value: 8}] },
{
value: 9 }
]
};
// Conversion function 1

/**
 * Wrap a nested array of numbers in a `{ children: [...] }` tree.
 *
 * @param {Array} arr - Nested array of numbers.
 * @returns {{children: Array}} Root node of the tree.
 */
function convert(arr) {
  return { children: help(arr) };
}

/**
 * Map every element of `arr` to a node: numbers become `{ value }`,
 * nested arrays become `{ children }` (recursively). Elements of any
 * other type are skipped.
 *
 * @param {Array} arr - One level of the nested array.
 * @returns {Array<object>} The nodes for this level.
 */
function help(arr) {
  const nodes = [];
  for (const item of arr) {
    if (typeof item === 'number') {
      nodes.push({ value: item });
      continue;
    }
    if (Array.isArray(item)) {
      nodes.push({ children: help(item) });
    }
  }
  return nodes;
}
// Conversion function 2

/**
 * Convert a number or a (nested) array of numbers into a tree node:
 * an array becomes `{ children: [...] }`, a number becomes `{ value }`.
 * Anything else yields `undefined`.
 *
 * @param {number|Array} item - The value to convert.
 * @returns {object|undefined} The node for `item`.
 */
function convert(item) {
  if (Array.isArray(item)) {
    const children = item.map((child) => convert(child));
    return { children };
  }
  if (typeof item === 'number') {
    return { value: item };
  }
}
the stack
In JavaScript, a stack can be simulated with an array, using `push()` and `pop()`
Topic: Try to write the "smart repeat" smartRepeat function to achieve:
- turn `3[abc]` into `abcabcabc`
- turn `3[2[a]2[b]]` into `aabbaabbaabb`
- turn `2[1[a]3[b]2[3[c]4[d]]]` into `abbbcccddddcccddddabbbcccddddcccdddd`
You do not need to consider malformed input strings. For example:
- `2[a3[b]]` is malformed: the bare `a` needs an explicit count of 1, i.e. `2[1[a]3[b]]`
- `[abc]` is malformed: it needs an explicit count of 1, i.e. `1[abc]`
Generally, when brackets are involved and lexical analysis is needed, a stack is often used
Idea: traverse each character
- If the character is a number, push the number onto stack A and an empty string onto stack B
- If the character is a letter, replace the top of stack B with the letter
- If the character is `]`, pop both stack A and stack B, repeat the popped string the popped number of times, and append the result to the string now on top of stack B
/**
 * Expand a "smart repeat" template: every `N[...]` group is replaced by its
 * content repeated N times. Groups may be nested and may follow one another.
 *
 * @example
 * smartRepeact('3[abc]');      // 'abcabcabc'
 * smartRepeact('3[2[a]2[b]]'); // 'aabbaabbaabb'
 *
 * @param {string} templateStr - Template built from `N[`, word characters and `]`.
 * @returns {string} The expanded string ('' for an empty template).
 * @throws {Error} When the template is malformed: a bare word outside the
 *   `word]` position, an unmatched `]`, or a group that is never closed.
 */
function smartRepeact(templateStr) {
  const counts = []; // repeat counts of the groups that are currently open
  const pieces = ['']; // text collected per open group; pieces[0] is the final output
  let idx = 0;

  while (idx < templateStr.length) {
    const rest = templateStr.substring(idx);

    // "N[" opens a new group.
    const opener = rest.match(/^(\d+)\[/);
    if (opener) {
      counts.push(Number(opener[1]));
      pieces.push('');
      // Advance by the matched text itself so counts such as "03" stay in sync.
      idx += opener[0].length;
      continue;
    }

    // Word characters directly followed by "]" are the content of the current group.
    const word = rest.match(/^(\w+)\]/);
    if (word) {
      pieces[pieces.length - 1] += word[1];
      idx += word[1].length;
      continue;
    }

    // "]" closes the innermost group: repeat its text and append it to the parent.
    if (rest[0] === ']' && counts.length > 0) {
      const times = counts.pop();
      const piece = pieces.pop();
      pieces[pieces.length - 1] += piece.repeat(times);
      idx++;
      continue;
    }

    // Nothing matched: fail loudly instead of looping forever on the same index.
    throw new Error(`Unexpected input at index ${idx}: "${rest}"`);
  }

  if (counts.length > 0) {
    throw new Error(`Missing "]" for ${counts.length} open group(s)`);
  }
  return pieces[0];
}
regular expression
A quick refresher on the regular-expression methods commonly used in JS:
// Each statement ends with a semicolon: several of the following lines start
// with a regex literal, which would otherwise be parsed as a division that
// continues the previous line.

// replace() swaps part of a string for other characters, or replaces a substring matched by a regular expression
'abc666def123'.replace(/\d/g, ''); // abcdef
// search() looks for a given substring, or for a substring matching a regular expression
// returns -1 when nothing is found
'abc666def123'.search(/\d/g); // 3
// match() looks up a given value in a string, or finds one or more regular-expression matches
// returns null when nothing is found
'abc666def123'.match(/\d/g); // ['6', '6', '6', '1', '2', '3']
// test() checks whether a string matches a pattern
/^\d/.test('5abc'); // true
/^\d/.test('abc'); // false
'555[abc]'.match(/^\d+\[/);
// ['555[', index: 0, input: '555[abc]', groups: undefined]
// () captures a group
'555[abc]'.match(/(^\d+)\[/);
// ['555[', '555', index: 0, input: '555[abc]', groups: undefined]
Handwritten AST abstract syntax tree
Source code of this chapter: https://gitee.com/szluyu99/vue-source-learn/tree/master/AST_Study
Identify start and end tags
For relevant knowledge, refer to the content of "Algorithm Reserve - Stack" above
parse.js
:
/**
 * Walk an HTML-like template string, recognising opening tags, closing tags,
 * comments and text, and check that the tags are properly nested.
 * Every non-blank run of text is logged to the console; nothing is returned.
 *
 * Only lowercase tags without attributes are recognised (`<div>`, `<h1>`).
 * A `<` that starts none of the recognised tokens is skipped.
 *
 * @param {string} templateStr - The template to scan.
 * @throws {Error} When a closing tag does not match the innermost open tag,
 *   or when a tag is still open at the end of the input.
 */
export default function parse(templateStr) {
  // Opening tag: <div>, <h1>
  const startRegExp = /^\<([a-z]+[1-6]?)\>/;
  // Closing tag: </div>, </h1>
  const endRegExp = /^\<\/([a-z]+[1-6]?)\>/;
  // HTML comment; skipped so that its content is not reported as text
  const commentRegExp = /^<!--[\s\S]*?-->/;
  // A run of text up to the next "<" (or up to the end of the input)
  const wordRegExp = /^[^\<]+/;

  const stackA = []; // names of the tags that are currently open
  let index = 0;

  while (index < templateStr.length) {
    const rest = templateStr.substring(index);

    const startMatch = rest.match(startRegExp);
    if (startMatch) {
      stackA.push(startMatch[1]);
      index += startMatch[0].length;
      continue;
    }

    const endMatch = rest.match(endRegExp);
    if (endMatch) {
      // A closing tag must close the innermost open tag.
      const openTag = stackA[stackA.length - 1];
      if (endMatch[1] !== openTag) {
        throw new Error(`${openTag} 标签没有封闭`);
      }
      stackA.pop();
      index += endMatch[0].length;
      continue;
    }

    const commentMatch = rest.match(commentRegExp);
    if (commentMatch) {
      index += commentMatch[0].length;
      continue;
    }

    const textMatch = rest.match(wordRegExp);
    if (textMatch) {
      const word = textMatch[0];
      // Whitespace-only runs (indentation between tags) are not reported.
      if (!/^\s+$/.test(word)) {
        console.log(`检测到文字:${word}`);
      }
      index += word.length;
      continue;
    }

    // A "<" that starts no recognised token: skip it.
    index++;
  }

  if (stackA.length > 0) {
    throw new Error(`${stackA[stackA.length - 1]} 标签没有封闭`);
  }
}
Using a stack to form an AST
Achieve the desired effect:
// Sample template: a <div> root holding a heading and a three-item list.
let htmlStr = `<div>
<h1>Hello</h1>
<ul>
<li>111</li>
<li>222</li>
<li>333</li>
</ul>
</div>`;
// Parse the template and print the value parse() returns.
const ast = parse(htmlStr)
console.log(ast);
parse.js
:
/**
 * Parse an HTML-like template string into an AST using two stacks.
 *
 * Element nodes have the shape `{ tag, children }`; text nodes have the shape
 * `{ text, type: 3 }`. Whitespace-only text and HTML comments are dropped.
 * Only lowercase tags without attributes are recognised (`<div>`, `<h1>`).
 * Progress is logged to the console while parsing.
 *
 * @param {string} templateStr - The template to parse.
 * @returns {object|undefined} The first top-level element, or `undefined`
 *   when the template contains no element. Text outside of it is ignored.
 * @throws {Error} When a closing tag does not match the innermost open tag,
 *   or when a tag is still open at the end of the input.
 */
export default function parse(templateStr) {
  // Opening tag: <div>, <h1>
  const startRegExp = /^\<([a-z]+[1-6]?)\>/;
  // Closing tag: </div>, </h1>
  const endRegExp = /^\<\/([a-z]+[1-6]?)\>/;
  // HTML comment; dropped from the AST
  const commentRegExp = /^<!--[\s\S]*?-->/;
  // A run of text up to the next "<" (or up to the end of the input)
  const wordRegExp = /^[^\<]+/;

  const stackA = []; // names of the tags that are currently open
  const stackB = [{ children: [] }]; // nodes under construction; stackB[0] is a synthetic root
  let index = 0;

  while (index < templateStr.length) {
    const rest = templateStr.substring(index);

    const startMatch = rest.match(startRegExp);
    if (startMatch) {
      const tag = startMatch[1];
      console.log(`检测到开始标记:<${tag}>`);
      stackA.push(tag);
      stackB.push({ tag: tag, children: [] });
      index += startMatch[0].length;
      continue;
    }

    const endMatch = rest.match(endRegExp);
    if (endMatch) {
      const tag = endMatch[1];
      console.log(`检测到结束标记:</${tag}>`);
      // Keep the popped name: it is the tag that was left unclosed on a mismatch.
      const openTag = stackA.pop();
      if (tag !== openTag) {
        throw new Error(`${openTag} 标签没有封闭`);
      }
      // The finished node becomes a child of the node below it on the stack.
      const node = stackB.pop();
      stackB[stackB.length - 1].children.push(node);
      index += endMatch[0].length;
      continue;
    }

    const commentMatch = rest.match(commentRegExp);
    if (commentMatch) {
      index += commentMatch[0].length;
      continue;
    }

    const textMatch = rest.match(wordRegExp);
    if (textMatch) {
      const word = textMatch[0];
      // Keep text only inside an element, and never whitespace-only runs.
      if (stackA.length > 0 && !/^\s+$/.test(word)) {
        console.log(`检测到文字:${word}`);
        stackB[stackB.length - 1].children.push({ text: word, type: 3 });
        console.log(stackB);
      }
      index += word.length;
      continue;
    }

    // A "<" that starts no recognised token: skip it.
    index++;
  }

  if (stackA.length > 0) {
    throw new Error(`${stackA[stackA.length - 1]} 标签没有封闭`);
  }
  console.log(stackB);
  return stackB[0].children[0];
}
Identify Attrs
parseAttrsString.js
: parse attrsString into an array of attrs objects
- String before parsing:
class="box red" id="mybox"
- Parsed array of objects:
[
{
name: 'class',
value: 'box red'
},
{
name: 'id',
value: 'mybox'
}
]
The core idea of parseAttrsString algorithm:
- Traverse attrsStr; whenever a space is encountered outside quotation marks, add the substring from the previous breakpoint up to the current position to result
/**
 * Parse an attribute string into an array of `{ name, value }` objects.
 *
 * Attributes are separated by whitespace that is not inside double quotes,
 * so `class="aa bb cc" id="gg"` yields two attributes (a plain `split()`
 * would cut the class value apart). Surrounding double quotes are removed
 * from the value. An attribute without a value (`disabled`) or with an
 * empty value (`class=""`) gets the value `''`.
 *
 * @param {string} [attrsStr] - For example ` class="box red" id="mybox"`.
 * @returns {Array<{name: string, value: string}>} The parsed attributes;
 *   an empty array for empty, blank or missing input.
 */
export default function parseAttrsString(attrsStr) {
  if (!attrsStr) return [];

  const tokens = []; // raw `name="value"` chunks
  let inQuotes = false; // whether the scan is currently inside double quotes
  let start = 0; // where the chunk being scanned begins

  // Run one step past the end so the last chunk is flushed by the same code path.
  for (let i = 0; i <= attrsStr.length; i++) {
    const c = attrsStr[i];
    if (c === '"') {
      inQuotes = !inQuotes;
      continue;
    }
    const atBreak = i === attrsStr.length || (!inQuotes && /\s/.test(c));
    if (!atBreak) continue;

    const token = attrsStr.substring(start, i).trim();
    // Runs of whitespace (leading, trailing or repeated) produce empty chunks: skip them.
    if (token) tokens.push(token);
    start = i;
  }

  // Turn ["k1=\"v1\"", "k2"] into [{ name: k1, value: v1 }, { name: k2, value: '' }].
  return tokens.map((token) => {
    const eq = token.indexOf('=');
    if (eq === -1) {
      return { name: token, value: '' };
    }
    const raw = token.slice(eq + 1);
    const isQuoted = raw.length >= 2 && raw.startsWith('"') && raw.endsWith('"');
    return { name: token.slice(0, eq), value: isQuoted ? raw.slice(1, -1) : raw };
  });
}
parse.js
:
/**
 * Parse an HTML-like template string into an AST using two stacks.
 *
 * Element nodes have the shape `{ tag, children, attrs }`, where `attrs` is
 * whatever `parseAttrsString` returns for the tag's attribute string; text
 * nodes have the shape `{ text, type: 3 }`. Whitespace-only text and HTML
 * comments are dropped. Only lowercase tag names are recognised.
 *
 * Relies on `parseAttrsString` being in scope (the text describes it as
 * living in parseAttrsString.js).
 *
 * @param {string} templateStr - The template to parse.
 * @returns {object|undefined} The first top-level element, or `undefined`
 *   when the template contains no element. Text outside of it is ignored.
 * @throws {Error} When a closing tag does not match the innermost open tag,
 *   or when a tag is still open at the end of the input.
 */
export default function parse(templateStr) {
  // Opening tag, optionally followed by an attribute string: <div class="box">
  const startRegExp = /^\<([a-z]+[1-6]?)(\s[^\<]+)?\>/;
  // Closing tag: </div>
  const endRegExp = /^\<\/([a-z]+[1-6]?)\>/;
  // HTML comment; dropped from the AST
  const commentRegExp = /^<!--[\s\S]*?-->/;
  // A run of text up to the next "<" (or up to the end of the input)
  const wordRegExp = /^[^\<]+/;

  const stackA = []; // names of the tags that are currently open
  const stackB = [{ children: [] }]; // nodes under construction; stackB[0] is a synthetic root
  let index = 0;

  while (index < templateStr.length) {
    const rest = templateStr.substring(index);

    const startMatch = rest.match(startRegExp);
    if (startMatch) {
      const tag = startMatch[1];
      const attrsString = startMatch[2]; // undefined when the tag has no attributes
      stackA.push(tag);
      stackB.push({
        tag: tag,
        children: [],
        attrs: parseAttrsString(attrsString),
      });
      // The whole match covers "<", the tag name, the attributes and ">".
      index += startMatch[0].length;
      continue;
    }

    const endMatch = rest.match(endRegExp);
    if (endMatch) {
      const tag = endMatch[1];
      const openTag = stackA.pop();
      // A closing tag must close the innermost open tag.
      if (tag !== openTag) {
        throw new Error(`${openTag} 标签没有封闭`);
      }
      // The finished node becomes a child of the node below it on the stack.
      const node = stackB.pop();
      stackB[stackB.length - 1].children.push(node);
      index += endMatch[0].length;
      continue;
    }

    const commentMatch = rest.match(commentRegExp);
    if (commentMatch) {
      index += commentMatch[0].length;
      continue;
    }

    const textMatch = rest.match(wordRegExp);
    if (textMatch) {
      const word = textMatch[0];
      // Keep text only inside an element, and never whitespace-only runs.
      if (stackA.length > 0 && !/^\s+$/.test(word)) {
        stackB[stackB.length - 1].children.push({ text: word, type: 3 });
      }
      index += word.length;
      continue;
    }

    // A "<" that starts no recognised token: skip it.
    index++;
  }

  if (stackA.length > 0) {
    throw new Error(`${stackA[stackA.length - 1]} 标签没有封闭`);
  }
  return stackB[0].children[0];
}