Copyright notice: this is the blogger's original article and may not be reproduced without permission. https://blog.csdn.net/guzhou_diaoke/article/details/85670931
- token kind
class TokenKind:
EOF = 0 # end-of-file
VARARG = 2 # ...
SEP_SEMI = 3 # ;
SEP_COMMA = 4 # ,
SEP_DOT = 5 # .
SEP_COLON = 6 # :
SEP_LABEL = 7 # ::
SEP_LPAREN = 8 # (
SEP_RPAREN = 9 # )
SEP_LBRACK = 10 # [
SEP_RBRACK = 11 # ]
SEP_LCURLY = 12 # {
SEP_RCURLY = 13 # }
OP_ASSIGN = 14 # =
OP_MINUS = 15 # - (sub or unm)
OP_WAVE = 16 # ~ (bnot or bxor)
OP_ADD = 17 # +
OP_MUL = 18 # *
OP_DIV = 19 # /
    OP_IDIV = 20 # //
OP_POW = 21 # ^
OP_MOD = 22 # %
OP_BAND = 23 # &
OP_BOR = 24 # |
OP_SHR = 25 # >>
OP_SHL = 26 # <<
OP_CONCAT = 27 # ..
OP_LT = 28 # <
OP_LE = 29 # <=
OP_GT = 30 # >
OP_GE = 31 # >=
OP_EQ = 32 # ==
OP_NE = 33 # ~=
OP_LEN = 34 # #
OP_AND = 35 # and
OP_OR = 36 # or
OP_NOT = 37 # not
KW_BREAK = 38 # break
KW_DO = 39 # do
KW_ELSE = 40 # else
KW_ELSEIF = 41 # elseif
KW_END = 42 # end
KW_FALSE = 43 # false
KW_FOR = 44 # for
KW_FUNCTION = 45 # function
KW_GOTO = 46 # goto
KW_IF = 47 # if
KW_IN = 48 # in
KW_LOCAL = 49 # local
KW_NIL = 50 # nil
KW_REPEAT = 51 # repeat
    KW_RETURN = 52 # return
KW_THEN = 53 # then
KW_TRUE = 54 # true
KW_UNTIL = 55 # until
KW_WHILE = 56 # while
IDENTIFIER = 57 # identifier
NUMBER = 58 # number literal
STRING = 59 # string literal
    OP_UNM = 60 # unary minus (written '-', lexed as OP_MINUS)
    OP_SUB = 61 # binary subtraction (written '-', lexed as OP_MINUS)
    OP_BNOT = 62 # bitwise not (written '~', lexed as OP_WAVE)
    OP_BXOR = 63 # bitwise xor (written '~', lexed as OP_WAVE)
- token
keywords = {
"and": TokenKind.OP_AND,
"break": TokenKind.KW_BREAK,
"do": TokenKind.KW_DO,
"else": TokenKind.KW_ELSE,
"elseif": TokenKind.KW_ELSEIF,
"end": TokenKind.KW_END,
"false": TokenKind.KW_FALSE,
"for": TokenKind.KW_FOR,
"function": TokenKind.KW_FUNCTION,
"goto": TokenKind.KW_GOTO,
"if": TokenKind.KW_IF,
"in": TokenKind.KW_IN,
"local": TokenKind.KW_LOCAL,
"nil": TokenKind.KW_NIL,
"not": TokenKind.OP_NOT,
"or": TokenKind.OP_OR,
"repeat": TokenKind.KW_REPEAT,
"return": TokenKind.KW_RETURN,
"then": TokenKind.KW_THEN,
"true": TokenKind.KW_TRUE,
"until": TokenKind.KW_UNTIL,
"while": TokenKind.KW_WHILE,
}
single_tokens = {
';': TokenKind.SEP_SEMI,
',': TokenKind.SEP_COMMA,
'(': TokenKind.SEP_LPAREN,
')': TokenKind.SEP_RPAREN,
']': TokenKind.SEP_RBRACK,
'{': TokenKind.SEP_LCURLY,
'}': TokenKind.SEP_RCURLY,
'+': TokenKind.OP_ADD,
'-': TokenKind.OP_MINUS,
'*': TokenKind.OP_MUL,
'^': TokenKind.OP_POW,
'%': TokenKind.OP_MOD,
'&': TokenKind.OP_BAND,
'|': TokenKind.OP_BOR,
'#': TokenKind.OP_LEN,
}
def kind_to_category(kind):
if kind < TokenKind.SEP_SEMI:
return "other"
if kind <= TokenKind.SEP_RCURLY:
return "separator"
if kind <= TokenKind.OP_NOT:
return "operator"
if kind <= TokenKind.KW_WHILE:
return "keyword"
if kind <= TokenKind.IDENTIFIER:
return "identifier"
if kind <= TokenKind.NUMBER:
return "number"
if kind <= TokenKind.STRING:
return "string"
return "other"
class Token:
def __init__(self, line, kind, value):
self.line = line
self.kind = kind
self.value = value
- get next token
import re

from lua_token import *  # TokenKind, keywords, single_tokens from the sections above (module name taken from the driver's imports)


class Lexer:
re_new_line = re.compile(r"\r\n|\n\r|\n|\r")
re_identifier = r"^[_\d\w]+"
re_opening_long_bracket = r"^\[=*\["
re_short_string = r"(?s)(^'(\\\\|\\'|\\\n|\\z\s*|[^'\n])*')|(^\"(\\\\|\\\"|\\\n|\\z\s*|[^\"\n])*\")"
re_number = r"^0[xX][0-9a-fA-F]*(\.[0-9a-fA-F]*)?([pP][+\-]?[0-9]+)?|^[0-9]*(\.[0-9]*)?([eE][+\-]?[0-9]+)?"
re_dec_escape_seq = r"^\\[0-9]{1,3}"
re_hex_escape_seq = r"^\\x[0-9a-fA-F]{2}"
re_unicode_escape_seq = r"^\\u\{[0-9a-fA-F]+\}"
def __init__(self, chunk, chunk_name):
self.chunk = chunk
self.chunk_name = chunk_name
self.line = 1
self.next_token = None
self.next_token_kind = None
self.next_token_line = 0
def get_next_token(self):
self.skip_space()
if len(self.chunk) == 0:
return self.line, TokenKind.EOF, 'EOF'
c = self.chunk[0]
if c in single_tokens:
self.next(1)
return self.line, single_tokens[c], c
if c == ':':
if self.test("::"):
self.next(2)
return self.line, TokenKind.SEP_LABEL, '::'
else:
self.next(1)
return self.line, TokenKind.SEP_COLON, c
if c == '/':
if self.test('//'):
self.next(2)
return self.line, TokenKind.OP_IDIV, '//'
else:
self.next(1)
return self.line, TokenKind.OP_DIV, c
if c == '~':
if self.test('~='):
self.next(2)
return self.line, TokenKind.OP_NE, '~='
else:
self.next(1)
return self.line, TokenKind.OP_WAVE, c
if c == '=':
if self.test('=='):
self.next(2)
return self.line, TokenKind.OP_EQ, '=='
else:
self.next(1)
return self.line, TokenKind.OP_ASSIGN, c
if c == '<':
if self.test('<<'):
self.next(2)
return self.line, TokenKind.OP_SHL, '<<'
elif self.test('<='):
self.next(2)
return self.line, TokenKind.OP_LE, '<='
else:
self.next(1)
return self.line, TokenKind.OP_LT, c
if c == '>':
if self.test('>>'):
self.next(2)
                return self.line, TokenKind.OP_SHR, '>>'
elif self.test('>='):
self.next(2)
return self.line, TokenKind.OP_GE, '>='
else:
self.next(1)
                return self.line, TokenKind.OP_GT, c
if c == '.':
if self.test('...'):
self.next(3)
return self.line, TokenKind.VARARG, '...'
elif self.test('..'):
self.next(2)
return self.line, TokenKind.OP_CONCAT, '..'
elif len(self.chunk) == 1 or not self.chunk[1].isdigit():
self.next(1)
return self.line, TokenKind.SEP_DOT, c
if c == '[':
if self.test('[[') or self.test('[='):
return self.line, TokenKind.STRING, self.scan_long_string()
else:
self.next(1)
return self.line, TokenKind.SEP_LBRACK, '['
if c in ('\'', '"'):
return self.line, TokenKind.STRING, self.scan_short_string()
if c == '.' or c.isdigit():
token = self.scan_number()
return self.line, TokenKind.NUMBER, token
if c == '_' or c.isalpha():
token = self.scan_identifier()
if token in keywords:
return self.line, keywords[token], token
else:
return self.line, TokenKind.IDENTIFIER, token
self.error('unexpected symbol near "%s"', c)
def next(self, n):
self.chunk = self.chunk[n:]
def test(self, s):
return self.chunk.startswith(s)
def error(self, f, *args):
        err = f % args if args else f  # callers pass either a printf-style format with args or a pre-formatted message
err = '{0}:{1}: {2}'.format(self.chunk_name, self.line, err)
raise Exception(err)
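Once the remaining methods below are in place, the lexer can be driven one token at a time. A minimal sketch (the chunk text and the chunk name "demo" are made up for illustration):
lx = Lexer("local x = 10 -- trailing comment", "demo")
print(lx.get_next_token())  # line 1, KW_LOCAL, 'local'
print(lx.get_next_token())  # line 1, IDENTIFIER, 'x'
print(lx.get_next_token())  # line 1, OP_ASSIGN, '='
print(lx.get_next_token())  # line 1, NUMBER, '10'
print(lx.get_next_token())  # line 1, EOF, 'EOF'  (the comment is skipped)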
- skip space
def skip_space(self):
while len(self.chunk) > 0:
if self.test('--'):
self.skip_comment()
elif self.test('\r\n') or self.test('\n\r'):
self.next(2)
self.line += 1
elif Lexer.is_new_line(self.chunk[0]):
self.next(1)
self.line += 1
elif Lexer.is_white_space(self.chunk[0]):
self.next(1)
else:
break
@staticmethod
def is_white_space(c):
return c in ('\t', '\n', '\v', '\f', '\r', ' ')
@staticmethod
def is_new_line(c):
return c in ('\r', '\n')
def skip_comment(self):
self.next(2)
if self.test('['):
if re.match(Lexer.re_opening_long_bracket, self.chunk):
self.scan_long_string()
return
while len(self.chunk) > 0 and not self.is_new_line(self.chunk[0]):
self.next(1)
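skip_space is also where line counting happens, since comments and newlines never reach the token dispatch in get_next_token. A small sketch (input made up for illustration) showing that both short and long comments are consumed while the line counter keeps advancing:
lx = Lexer("-- short comment\n--[[ a long\ncomment ]]\nprint", "demo")
print(lx.get_next_token())  # line 4, IDENTIFIER, 'print'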
- short string
def scan_short_string(self):
m = re.match(Lexer.re_short_string, self.chunk)
if m:
s = m.group()
self.next(len(s))
s = s[1: len(s)-1]
if s.find('\\') >= 0:
result = Lexer.re_new_line.findall(s)
self.line += len(result)
s = self.escape(s)
return s
self.error('unfinished string')
return ''
def escape(self, s):
ret = ''
while len(s) > 0:
if s[0] != '\\':
ret += s[0]
s = s[1:]
continue
if len(s) == 1:
self.error('unfinished string')
if s[1] == 'a':
ret += '\a'
s = s[2:]
continue
elif s[1] == 'b':
ret += '\b'
s = s[2:]
continue
elif s[1] == 'f':
ret += '\f'
s = s[2:]
continue
elif s[1] == 'n' or s[1] == '\n':
ret += '\n'
s = s[2:]
continue
elif s[1] == 'r':
ret += '\r'
s = s[2:]
continue
elif s[1] == 't':
ret += '\t'
s = s[2:]
continue
elif s[1] == 'v':
ret += '\v'
s = s[2:]
continue
elif s[1] == '"':
ret += '"'
s = s[2:]
continue
elif s[1] == '\'':
ret += '\''
s = s[2:]
continue
elif s[1] == '\\':
ret += '\\'
s = s[2:]
continue
elif s[1] in '0123456789':
m = re.match(Lexer.re_dec_escape_seq, s)
if m:
str_dec = m.group()[1:]
d = int(str_dec)
if d <= 0xff:
ret += str(chr(d))
s = s[len(m.group()):]
continue
self.error('decimal escape too large near "%s"' % m.group())
elif s[1] == 'x':
m = re.match(Lexer.re_hex_escape_seq, s)
if m:
str_hex = '0' + m.group()[1:]
d = int(str_hex, 16)
ret += str(chr(d))
s = s[len(m.group()):]
continue
elif s[1] == 'u':
m = re.match(Lexer.re_unicode_escape_seq, s)
if m:
str_unicode = m.group()[3: len(m.group())-1]
d = int(str_unicode, 16)
if d <= 0x10ffff:
ret += str(chr(d))
s = s[len(m.group()):]
continue
self.error('UTF-8 value too large near "%s"' % str_unicode)
            elif s[1] == 'z':
                s = s[2:]
                while len(s) > 0 and Lexer.is_white_space(s[0]):
                    s = s[1:]
                continue
            # no rule matched (unknown escape, or malformed \x / \u): raise instead of looping forever
            self.error('invalid escape sequence near "\\%s"' % s[1])
        return ret
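The escape method can also be exercised on its own; it expects the string body with the quotes already stripped. A throwaway sketch (the empty chunk and the name "demo" are arbitrary) mirroring the escape sequences used in the test chunk further below:
lx = Lexer('', 'demo')
print(lx.escape('\\97lo\\10\\04923"'))  # decimal escapes -> alo, newline, 123"
print(lx.escape('\\x66'))               # hex escape -> f
print(lx.escape('\\u{03a9}\\u{03a8}'))  # unicode escapes -> ΩΨ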
- long string
def scan_long_string(self):
m_open = re.match(Lexer.re_opening_long_bracket, self.chunk)
if m_open is None:
self.error('invalid long string delimiter near "%s"' % self.chunk[0:2])
str_open = m_open.group()
str_close = str_open.replace('[', ']')
close_idx = self.chunk.find(str_close)
if close_idx < 0:
self.error('unfinished long string or comment')
s = self.chunk[len(str_open): close_idx]
self.next(close_idx + len(str_close))
        s = re.sub(Lexer.re_new_line, '\n', s)  # normalize all line endings to '\n'
self.line += s.count('\n')
if len(s) > 0 and s[0] == '\n':
s = s[1:]
return s
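Long bracket strings keep their content verbatim except that line endings are normalized and a single leading newline is dropped, and every newline inside them still bumps the line counter. A short sketch (input made up for illustration):
lx = Lexer('[==[\nalo\n123"]==] rest', 'demo')
print(lx.get_next_token())  # line 1, STRING, 'alo\n123"'
print(lx.line)              # 3: both newlines inside the brackets were counted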
- number and identifier
def scan_identifier(self):
return self.scan(Lexer.re_identifier)
def scan_number(self):
return self.scan(Lexer.re_number)
def scan(self, pattern):
m = re.match(pattern, self.chunk)
if m:
token = m.group()
self.next(len(token))
return token
raise Exception('unreachable')
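scan_number hands everything to re_number, which also accepts hexadecimal constants with an optional fraction and binary exponent. A quick sketch (input made up for illustration):
lx = Lexer("0x3p-1 123.456e78 0xff.8", "demo")
print(lx.get_next_token())  # line 1, NUMBER, '0x3p-1'
print(lx.get_next_token())  # line 1, NUMBER, '123.456e78'
print(lx.get_next_token())  # line 1, NUMBER, '0xff.8'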
- test
print("hello") -- short comment
print("world") --> another short comment
print() --[[ long comment ]]
--[===[
another
long comment
]===]

print("hello, \z
world!") --> hello, world!
a = 'alo\n123"'
a = "alo\n123\""
a = '\97lo\10\04923"'
a = [[alo
123"]]
a = [==[
alo
123"]==]
a = 123.456e78
a = 'ação'
a = '\x66'
a = '\u{03a9}\u{03a8}'
a = [======aaa
- main
import sys
from lexer import Lexer
from lua_token import *
def main():
with open(sys.argv[1], 'r') as f:
data = f.read()
lexer = Lexer(data, sys.argv[1])
while True:
try:
line, kind, token = lexer.get_next_token()
print('[%2d] [%-10s] %s' % (line, kind_to_category(kind), token))
if kind == TokenKind.EOF:
break
except Exception as e:
sys.exit(e)
if __name__ == '__main__':
if len(sys.argv) == 2:
main()
else:
        print('error: expected a single Lua source file as the only argument')
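Assuming the listings above are saved as lua_token.py, lexer.py, and main.py (the first two names come from the driver's imports; main.py is just a guess for the driver's file name), running the driver on the test chunk shown earlier produces the output below:
python main.py ./test/lexer.lua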
- result
[ 1] [identifier] print
[ 1] [separator ] (
[ 1] [string ] hello
[ 1] [separator ] )
[ 2] [identifier] print
[ 2] [separator ] (
[ 2] [string ] world
[ 2] [separator ] )
[ 3] [identifier] print
[ 3] [separator ] (
[ 3] [separator ] )
[ 9] [identifier] print
[ 9] [separator ] (
[ 9] [string ] hello, world!
[10] [separator ] )
[12] [identifier] a
[12] [operator ] =
[12] [string ] alo
123"
[13] [identifier] a
[13] [operator ] =
[13] [string ] alo
123"
[14] [identifier] a
[14] [operator ] =
[14] [string ] alo
123"
[15] [identifier] a
[15] [operator ] =
[15] [string ] alo
123"
[17] [identifier] a
[17] [operator ] =
[17] [string ] alo
123"
[20] [identifier] a
[20] [operator ] =
[20] [number ] 123.456e78
[21] [identifier] a
[21] [operator ] =
[21] [string ] ação
[22] [identifier] a
[22] [operator ] =
[22] [string ] f
[23] [identifier] a
[23] [operator ] =
[23] [string ] ΩΨ
[24] [identifier] a
[24] [operator ] =
./test/lexer.lua:24: invalid long string delimiter near "[="