如何写一个简单的解释器-1

Lan的源代码由一些基本元素构成，我们称之为Token，在词法分析阶段我们需要将输入的字符流转化成Token流（简单说就是Token列表）。

下面是Token的类型定义，为了节省资源采用整数表示而不用枚举类型。

/**
 * Integer codes identifying each kind of token.
 *
 * <p>Plain {@code int} constants are used instead of an enum. The trailing
 * comment on each constant shows the source text (or the kind of text) that the
 * token stands for. This class only holds constants and cannot be instantiated.
 */
public final class TokenType {
    public static final int PLUS        = 0;  // "+"
    public static final int PLUSPLUS    = 1;  // "++"
    public static final int MINUS       = 2;  // "-"
    public static final int MINUSMINUS  = 3;  // "--"
    public static final int ASTERISK    = 4;  // "*"
    public static final int SLASH       = 5;  // "/"
    public static final int PERCENT     = 6;  // "%"
    public static final int EQUAL       = 7;  // "=="
    public static final int NOT_EQUAL   = 8;  // "!="
    public static final int GT          = 9;  // ">"
    public static final int GE          = 10; // ">="
    public static final int LT          = 11; // "<"
    public static final int LE          = 12; // "<="
    public static final int AND         = 13; // "&&"
    public static final int OR          = 14; // "||"
    public static final int BANG        = 15; // "!"
    public static final int LEFT_PAREN  = 16; // "("
    public static final int RIGHT_PAREN = 17; // ")"
    public static final int LEFT_BRACE  = 18; // "{"
    public static final int RIGHT_BRACE = 19; // "}"
    public static final int COMMA       = 20; // ","
    public static final int QUESTION    = 21; // "?"
    public static final int COLON       = 22; // ":"
    public static final int NUMBER      = 23; // number literal
    public static final int STRING      = 24; // string literal
    public static final int ASSIGN      = 25; // "="
    public static final int TRUE        = 26; // "true"
    public static final int FALSE       = 27; // "false"
    public static final int NULL        = 28; // "null"
    public static final int IDENTIFIER  = 29; // variable name
    public static final int IF          = 30; // "if"
    public static final int ELSE        = 31; // "else"
    public static final int WHILE       = 32; // "while"
    public static final int BREAK       = 33; // "break"
    public static final int CONTINUE    = 34; // "continue"
    public static final int PRINT       = 35; // "print"
    public static final int FUNC        = 36; // "func"
    public static final int RETURN      = 37; // "return"
    public static final int EOF         = 38; // end of input

    /** Not instantiable: this class is only a namespace for the constants above. */
    private TokenType() {
    }
}

每种类型代表的内容看后面的注释即可，没有值得解释的内容。然后定义Token的结构。

/**
 * One lexical unit of the source code, as produced by the lexer.
 */
public class Token {

    /** Token type code. */
    public int type;

    /** Token content; see the per-constant comments in TokenType. */
    public String symbol;

    /** Line number of the source code on which the token appears. */
    public int line;

    /**
     * Creates a token from its three attributes.
     *
     * @param type   token type code
     * @param symbol token content
     * @param line   line number of the token in the source code
     */
    public Token(int type, String symbol, int line) {
        this.line = line;
        this.symbol = symbol;
        this.type = type;
    }
}

接下来是词法分析器，我们称之为Lexer。它逐个读取源码中的字符并生成Token列表，具体步骤见代码中的注释。

/**
 * Lexical analyzer: converts source text into a list of {@link Token}s.
 */
public class Lexer {
    // Keyword dictionary; every word read from the source is looked up here to
    // decide whether it is a keyword or an ordinary identifier.
    private final Map<String, Integer> keywordsFilter;

    /** Creates a lexer and fills in the keyword dictionary. */
    public Lexer() {
        keywordsFilter = new HashMap<>();
        keywordsFilter.put("true", TokenType.TRUE);
        keywordsFilter.put("false", TokenType.FALSE);
        keywordsFilter.put("null", TokenType.NULL);
        keywordsFilter.put("if", TokenType.IF);
        keywordsFilter.put("else", TokenType.ELSE);
        keywordsFilter.put("while", TokenType.WHILE);
        keywordsFilter.put("break", TokenType.BREAK);
        keywordsFilter.put("continue", TokenType.CONTINUE);
        keywordsFilter.put("print", TokenType.PRINT);
        keywordsFilter.put("func", TokenType.FUNC);
        keywordsFilter.put("return", TokenType.RETURN);
    }

    /**
     * Splits the given source text into tokens.
     *
     * <p>Spaces, carriage returns and tabs are skipped; each line feed advances
     * the line counter; everything from {@code //} up to the end of the line is
     * ignored. The returned list always ends with an {@code EOF} token.
     *
     * @param code the source text to tokenize
     * @return the tokens in source order, terminated by an {@code EOF} token
     * @throws RuntimeException if a lone {@code |} or {@code &} is found, if a
     *         string literal is not closed by {@code "} on the same line, or if
     *         an unknown character is encountered
     */
    public List<Token> lex(String code) {
        // Collects every token to be returned.
        List<Token> tokens = new ArrayList<>();
        // Index of the next character to read from the source.
        int index = 0;
        // Line number assigned to the tokens currently being produced.
        int currentLine = 1;
        // Total number of characters in the source.
        int codeLength = code.length();

        while (index < codeLength) {
            // Take the next character and advance past it.
            char c = code.charAt(index++);
            switch (c) {
                case ' ':
                case '\r':
                case '\t':
                    // Whitespace carries no meaning.
                    break;
                case '\n':
                    currentLine++;
                    break;
                case '+':
                    if (nextIs(code, index, '+')) {
                        index++;
                        tokens.add(new Token(TokenType.PLUSPLUS, "++", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.PLUS, "+", currentLine));
                    }
                    break;
                case '-':
                    if (nextIs(code, index, '-')) {
                        index++;
                        tokens.add(new Token(TokenType.MINUSMINUS, "--", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.MINUS, "-", currentLine));
                    }
                    break;
                case '*':
                    tokens.add(new Token(TokenType.ASTERISK, "*", currentLine));
                    break;
                case '/':
                    if (nextIs(code, index, '/')) {
                        // Line comment: skip the second '/' and everything up to
                        // (but not including) the line feed, so the main loop
                        // still sees the '\n' and counts the line.
                        index++;
                        while (index < codeLength && code.charAt(index) != '\n') {
                            index++;
                        }
                    } else {
                        tokens.add(new Token(TokenType.SLASH, "/", currentLine));
                    }
                    break;
                case '%':
                    tokens.add(new Token(TokenType.PERCENT, "%", currentLine));
                    break;
                case '(':
                    tokens.add(new Token(TokenType.LEFT_PAREN, "(", currentLine));
                    break;
                case ')':
                    tokens.add(new Token(TokenType.RIGHT_PAREN, ")", currentLine));
                    break;
                case '{':
                    tokens.add(new Token(TokenType.LEFT_BRACE, "{", currentLine));
                    break;
                case '}':
                    tokens.add(new Token(TokenType.RIGHT_BRACE, "}", currentLine));
                    break;
                case ',':
                    tokens.add(new Token(TokenType.COMMA, ",", currentLine));
                    break;
                case '?':
                    tokens.add(new Token(TokenType.QUESTION, "?", currentLine));
                    break;
                case ':':
                    tokens.add(new Token(TokenType.COLON, ":", currentLine));
                    break;
                case '>':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.GE, ">=", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.GT, ">", currentLine));
                    }
                    break;
                case '<':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.LE, "<=", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.LT, "<", currentLine));
                    }
                    break;
                case '!':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.NOT_EQUAL, "!=", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.BANG, "!", currentLine));
                    }
                    break;
                case '|':
                    // Only the two-character form "||" is a valid token.
                    if (!nextIs(code, index, '|')) {
                        throw new RuntimeException("Lexer Error: expect '|'");
                    }
                    index++;
                    tokens.add(new Token(TokenType.OR, "||", currentLine));
                    break;
                case '&':
                    // Only the two-character form "&&" is a valid token.
                    if (!nextIs(code, index, '&')) {
                        throw new RuntimeException("Lexer Error: expect '&'");
                    }
                    index++;
                    tokens.add(new Token(TokenType.AND, "&&", currentLine));
                    break;
                case '=':
                    if (nextIs(code, index, '=')) {
                        index++;
                        tokens.add(new Token(TokenType.EQUAL, "==", currentLine));
                    } else {
                        tokens.add(new Token(TokenType.ASSIGN, "=", currentLine));
                    }
                    break;
                case '"': {
                    // String literal: the content runs from just after the
                    // opening quote to just before the closing quote, which must
                    // appear on the same line.
                    int start = index;
                    int end = start;
                    while (end < codeLength
                            && code.charAt(end) != '"'
                            && code.charAt(end) != '\n') {
                        end++;
                    }
                    // Reaching the end of input (including the case where the
                    // opening quote is the very last character) or a line feed
                    // means the literal was never closed.
                    if (end >= codeLength || code.charAt(end) != '"') {
                        throw new RuntimeException("Lexer Error: expect \"");
                    }
                    tokens.add(new Token(TokenType.STRING, code.substring(start, end), currentLine));
                    index = end + 1; // continue after the closing quote
                    break;
                }
                default:
                    if (Character.isDigit(c)) {
                        // Number: a maximal run of digits starting at c.
                        int start = index - 1;
                        while (index < codeLength && Character.isDigit(code.charAt(index))) {
                            index++;
                        }
                        tokens.add(new Token(TokenType.NUMBER, code.substring(start, index), currentLine));
                    } else if (Character.isAlphabetic(c)) {
                        // Word: a maximal run of alphabetic characters, either a
                        // keyword from the dictionary or an identifier.
                        int start = index - 1;
                        while (index < codeLength && Character.isAlphabetic(code.charAt(index))) {
                            index++;
                        }
                        String word = code.substring(start, index);
                        Integer type = keywordsFilter.get(word);
                        tokens.add(new Token(type == null ? TokenType.IDENTIFIER : type, word, currentLine));
                    } else {
                        throw new RuntimeException(String.format("Lexer Error: unknown character \"%c\"", c));
                    }
                    break;
            }
        }
        tokens.add(new Token(TokenType.EOF, "", currentLine));
        return tokens;
    }

    /** Returns true if position {@code index} exists in {@code code} and holds {@code expected}. */
    private static boolean nextIs(String code, int index, char expected) {
        return index < code.length() && code.charAt(index) == expected;
    }
}

最后手动测试一下：程序从标准输入逐行读取代码，并打印每个Token的内容，输入 .q 退出。

/**
 * Interactive test driver for the lexer.
 */
public class Main {
    /**
     * Reads source code from standard input one line at a time, tokenizes each
     * line and prints the symbol of every resulting token. The loop ends when
     * the line {@code .q} is entered or when standard input is exhausted.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Lexer lexer = new Lexer();
        try (Scanner scanner = new Scanner(System.in)) {
            while (true) {
                System.out.print(">>> ");
                // nextLine() throws NoSuchElementException once input is
                // exhausted (e.g. Ctrl-D or piped input), so check first.
                if (!scanner.hasNextLine()) {
                    break;
                }
                String code = scanner.nextLine();
                if (code.equals(".q")) {
                    break;
                }
                try {
                    List<Token> tokens = lexer.lex(code);
                    for (Token token : tokens) {
                        System.out.println(token.symbol);
                    }
                } catch (RuntimeException e) {
                    // The lexer reports lexical errors as RuntimeException;
                    // show the message and keep the session alive.
                    System.out.println(e.getMessage());
                }
            }
        }
    }
}

如何写一个简单的解释器-1

猜你喜欢