Copyright notice: this is the blogger's original article and may not be reproduced without permission. https://blog.csdn.net/guzhou_diaoke/article/details/85670931
- token kind
class TokenKind:
EOF = 0 # end-of-file
VARARG = 2 # ...
SEP_SEMI = 3 # ;
SEP_COMMA = 4 # ,
SEP_DOT = 5 # .
SEP_COLON = 6 # :
SEP_LABEL = 7 # ::
SEP_LPAREN = 8 # (
SEP_RPAREN = 9 # )
SEP_LBRACK = 10 # [
SEP_RBRACK = 11 # ]
SEP_LCURLY = 12 # {
SEP_RCURLY = 13 # }
OP_ASSIGN = 14 # =
OP_MINUS = 15 # - (sub or unm)
OP_WAVE = 16 # ~ (bnot or bxor)
OP_ADD = 17 # +
OP_MUL = 18 # *
OP_DIV = 19 # /
    OP_IDIV = 20 # //
OP_POW = 21 # ^
OP_MOD = 22 # %
OP_BAND = 23 # &
OP_BOR = 24 # |
OP_SHR = 25 # >>
OP_SHL = 26 # <<
OP_CONCAT = 27 # ..
OP_LT = 28 # <
OP_LE = 29 # <=
OP_GT = 30 # >
OP_GE = 31 # >=
OP_EQ = 32 # ==
OP_NE = 33 # ~=
OP_LEN = 34 # #
OP_AND = 35 # and
OP_OR = 36 # or
OP_NOT = 37 # not
KW_BREAK = 38 # break
KW_DO = 39 # do
KW_ELSE = 40 # else
KW_ELSEIF = 41 # elseif
KW_END = 42 # end
KW_FALSE = 43 # false
KW_FOR = 44 # for
KW_FUNCTION = 45 # function
KW_GOTO = 46 # goto
KW_IF = 47 # if
KW_IN = 48 # in
KW_LOCAL = 49 # local
KW_NIL = 50 # nil
KW_REPEAT = 51 # repeat
    KW_RETURN = 52 # return
KW_THEN = 53 # then
KW_TRUE = 54 # true
KW_UNTIL = 55 # until
KW_WHILE = 56 # while
IDENTIFIER = 57 # identifier
NUMBER = 58 # number literal
STRING = 59 # string literal
    OP_UNM = 60 # unary minus (written '-', lexed as OP_MINUS)
    OP_SUB = 61 # binary subtraction (written '-', lexed as OP_MINUS)
    OP_BNOT = 62 # bitwise not (written '~', lexed as OP_WAVE)
    OP_BXOR = 63 # bitwise xor (written '~', lexed as OP_WAVE)
- token
keywords = {
"and": TokenKind.OP_AND,
"break": TokenKind.KW_BREAK,
"do": TokenKind.KW_DO,
"else": TokenKind.KW_ELSE,
"elseif": TokenKind.KW_ELSEIF,
"end": TokenKind.KW_END,
"false": TokenKind.KW_FALSE,
"for": TokenKind.KW_FOR,
"function": TokenKind.KW_FUNCTION,
"goto": TokenKind.KW_GOTO,
"if": TokenKind.KW_IF,
"in": TokenKind.KW_IN,
"local": TokenKind.KW_LOCAL,
"nil": TokenKind.KW_NIL,
"not": TokenKind.OP_NOT,
"or": TokenKind.OP_OR,
"repeat": TokenKind.KW_REPEAT,
"return": TokenKind.KW_RETURN,
"then": TokenKind.KW_THEN,
"true": TokenKind.KW_TRUE,
"until": TokenKind.KW_UNTIL,
"while": TokenKind.KW_WHILE,
}
single_tokens = {
';': TokenKind.SEP_SEMI,
',': TokenKind.SEP_COMMA,
'(': TokenKind.SEP_LPAREN,
')': TokenKind.SEP_RPAREN,
']': TokenKind.SEP_RBRACK,
'{': TokenKind.SEP_LCURLY,
'}': TokenKind.SEP_RCURLY,
'+': TokenKind.OP_ADD,
'-': TokenKind.OP_MINUS,
'*': TokenKind.OP_MUL,
'^': TokenKind.OP_POW,
'%': TokenKind.OP_MOD,
'&': TokenKind.OP_BAND,
'|': TokenKind.OP_BOR,
'#': TokenKind.OP_LEN,
}
def kind_to_category(kind):
if kind < TokenKind.SEP_SEMI:
return "other"
if kind <= TokenKind.SEP_RCURLY:
return "separator"
if kind <= TokenKind.OP_NOT:
return "operator"
if kind <= TokenKind.KW_WHILE:
return "keyword"
if kind <= TokenKind.IDENTIFIER:
return "identifier"
if kind <= TokenKind.NUMBER:
return "number"
if kind <= TokenKind.STRING:
return "string"
return "other"
class Token:
def __init__(self, line, kind, value):
self.line = line
self.kind = kind
self.value = value
- get next token
import re

from lua_token import *  # TokenKind, keywords, single_tokens from the sections above (module name taken from the driver's imports)


class Lexer:
re_new_line = re.compile(r"\r\n|\n\r|\n|\r")
re_identifier = r"^[_\d\w]+"
re_opening_long_bracket = r"^\[=*\["
re_short_string = r"(?s)(^'(\\\\|\\'|\\\n|\\z\s*|[^'\n])*')|(^\"(\\\\|\\\"|\\\n|\\z\s*|[^\"\n])*\")"
re_number = r"^0[xX][0-9a-fA-F]*(\.[0-9a-fA-F]*)?([pP][+\-]?[0-9]+)?|^[0-9]*(\.[0-9]*)?([eE][+\-]?[0-9]+)?"
re_dec_escape_seq = r"^\\[0-9]{1,3}"
re_hex_escape_seq = r"^\\x[0-9a-fA-F]{2}"
re_unicode_escape_seq = r"^\\u\{[0-9a-fA-F]+\}"
def __init__(self, chunk, chunk_name):
self.chunk = chunk
self.chunk_name = chunk_name
self.line = 1
self.next_token = None
self.next_token_kind = None
self.next_token_line = 0
def get_next_token(self):
self.skip_space()
if len(self.chunk) == 0:
return self.line, TokenKind.EOF, 'EOF'
c = self.chunk[0]
if c in single_tokens:
self.next(1)
return self.line, single_tokens[c], c
if c == ':':
if self.test("::"):
self.next(2)
return self.line, TokenKind.SEP_LABEL, '::'
else:
self.next(1)
return self.line, TokenKind.SEP_COLON, c
if c == '/':
if self.test('//'):
self.next(2)
return self.line, TokenKind.OP_IDIV, '//'
else:
self.next(1)
return self.line, TokenKind.OP_DIV, c
if c == '~':
if self.test('~='):
self.next(2)
return self.line, TokenKind.OP_NE, '~='
else:
self.next(1)
return self.line, TokenKind.OP_WAVE, c
if c == '=':
if self.test('=='):
self.next(2)
return self.line, TokenKind.OP_EQ, '=='
else:
self.next(1)
return self.line, TokenKind.OP_ASSIGN, c
if c == '<':
if self.test('<<'):
self.next(2)
return self.line, TokenKind.OP_SHL, '<<'
elif self.test('<='):
self.next(2)
return self.line, TokenKind.OP_LE, '<='
else:
self.next(1)
return self.line, TokenKind.OP_LT, c
if c == '>':
if self.test('>>'):
self.next(2)
                return self.line, TokenKind.OP_SHR, '>>'
elif self.test('>='):
self.next(2)
return self.line, TokenKind.OP_GE, '>='
else:
self.next(1)
                return self.line, TokenKind.OP_GT, c
if c == '.':
if self.test('...'):
self.next(3)
return self.line, TokenKind.VARARG, '...'
elif self.test('..'):
self.next(2)
return self.line, TokenKind.OP_CONCAT, '..'
elif len(self.chunk) == 1 or not self.chunk[1].isdigit():
self.next(1)
return self.line, TokenKind.SEP_DOT, c
if c == '[':
if self.test('[[') or self.test('[='):
return self.line, TokenKind.STRING, self.scan_long_string()
else:
self.next(1)
return self.line, TokenKind.SEP_LBRACK, '['
if c in ('\'', '"'):
return self.line, TokenKind.STRING, self.scan_short_string()
if c == '.' or c.isdigit():
token = self.scan_number()
return self.line, TokenKind.NUMBER, token
if c == '_' or c.isalpha():
token = self.scan_identifier()
if token in keywords:
return self.line, keywords[token], token
else:
return self.line, TokenKind.IDENTIFIER, token
self.error('unexpected symbol near "%s"', c)
def next(self, n):
self.chunk = self.chunk[n:]
def test(self, s):
return self.chunk.startswith(s)
def error(self, f, *args):
        err = f % args if args else f  # callers pass either a printf-style format with args or a pre-formatted message
err = '{0}:{1}: {2}'.format(self.chunk_name, self.line, err)
raise Exception(err)
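Once the remaining methods below are in place, the lexer can be driven one token at a time. A minimal sketch (the chunk text and the chunk name "demo" are made up for illustration):
lx = Lexer("local x = 10 -- trailing comment", "demo")
print(lx.get_next_token())  # line 1, KW_LOCAL, 'local'
print(lx.get_next_token())  # line 1, IDENTIFIER, 'x'
print(lx.get_next_token())  # line 1, OP_ASSIGN, '='
print(lx.get_next_token())  # line 1, NUMBER, '10'
print(lx.get_next_token())  # line 1, EOF, 'EOF'  (the comment is skipped)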
- skip space
def skip_space(self):
while len(self.chunk) > 0:
if self.test('--'):
self.skip_comment()
elif self.test('\r\n') or self.test('\n\r'):
self.next(2)
self.line += 1
elif Lexer.is_new_line(self.chunk[0]):
self.next(1)
self.line += 1
elif Lexer.is_white_space(self.chunk[0]):
self.next(1)
else:
break
@staticmethod
def is_white_space(c):
return c in ('\t', '\n', '\v', '\f', '\r', ' ')
@staticmethod
def is_new_line(c):
return c in ('\r', '\n')
def skip_comment(self):
self.next(2)
if self.test('['):
if re.match(Lexer.re_opening_long_bracket, self.chunk):
self.scan_long_string()
return
while len(self.chunk) > 0 and not self.is_new_line(self.chunk[0]):
self.next(1)
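skip_space is also where line counting happens, since comments and newlines never reach the token dispatch in get_next_token. A small sketch (input made up for illustration) showing that both short and long comments are consumed while the line counter keeps advancing:
lx = Lexer("-- short comment\n--[[ a long\ncomment ]]\nprint", "demo")
print(lx.get_next_token())  # line 4, IDENTIFIER, 'print'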
- short string
def scan_short_string(self):
m = re.match(Lexer.re_short_string, self.chunk)
if m:
s = m.group()
self.next(len(s))
s = s[1: len(s)-1]
if s.find('\\') >= 0:
result = Lexer.re_new_line.findall(s)
self.line += len(result)
s = self.escape(s)
return s
self.error('unfinished string')
return ''
def escape(self, s):
ret = ''
while len(s) > 0:
if s[0] != '\\':
ret += s[0]
s = s[1:]
continue
if len(s) == 1:
self.error('unfinished string')
if s[1] == 'a':
ret += '\a'
s = s[2:]
continue
elif s[1] == 'b':
ret += '\b'
s = s[2:]
continue
elif s[1] == 'f':
ret += '\f'
s = s[2:]
continue
elif s[1] == 'n' or s[1] == '\n':
ret += '\n'
s = s[2:]
continue
elif s[1] == 'r':
ret += '\r'
s = s[2:]
continue
elif s[1] == 't':
ret += '\t'
s = s[2:]
continue
elif s[1] == 'v':
ret += '\v'
s = s[2:]
continue
elif s[1] == '"':
ret += '"'
s = s[2:]
continue
elif s[1] == '\'':
ret += '\''
s = s[2:]
continue
elif s[1] == '\\':
ret += '\\'
s = s[2:]
continue
elif s[1] in '0123456789':
m = re.match(Lexer.re_dec_escape_seq, s)
if m:
str_dec = m.group()[1:]
d = int(str_dec)
if d <= 0xff:
ret += str(chr(d))
s = s[len(m.group()):]
continue
self.error('decimal escape too large near "%s"' % m.group())
elif s[1] == 'x':
m = re.match(Lexer.re_hex_escape_seq, s)
if m:
str_hex = '0' + m.group()[1:]
d = int(str_hex, 16)
ret += str(chr(d))
s = s[len(m.group()):]
continue
elif s[1] == 'u':
m = re.match(Lexer.re_unicode_escape_seq, s)
if m:
str_unicode = m.group()[3: len(m.group())-1]
d = int(str_unicode, 16)
if d <= 0x10ffff:
ret += str(chr(d))
s = s[len(m.group()):]
continue
self.error('UTF-8 value too large near "%s"' % str_unicode)
            elif s[1] == 'z':
                s = s[2:]
                while len(s) > 0 and Lexer.is_white_space(s[0]):
                    s = s[1:]
                continue
            # no rule matched (unknown escape, or malformed \x / \u): raise instead of looping forever
            self.error('invalid escape sequence near "\\%s"' % s[1])
        return ret
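The escape method can also be exercised on its own; it expects the string body with the quotes already stripped. A throwaway sketch (the empty chunk and the name "demo" are arbitrary) mirroring the escape sequences used in the test chunk further below:
lx = Lexer('', 'demo')
print(lx.escape('\\97lo\\10\\04923"'))  # decimal escapes -> alo, newline, 123"
print(lx.escape('\\x66'))               # hex escape -> f
print(lx.escape('\\u{03a9}\\u{03a8}'))  # unicode escapes -> ΩΨ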
- long string
def scan_long_string(self):
m_open = re.match(Lexer.re_opening_long_bracket, self.chunk)
if m_open is None:
self.error('invalid long string delimiter near "%s"' % self.chunk[0:2])
str_open = m_open.group()
str_close = str_open.replace('[', ']')
close_idx = self.chunk.find(str_close)
if close_idx < 0:
self.error('unfinished long string or comment')
s = self.chunk[len(str_open): close_idx]
self.next(close_idx + len(str_close))
        s = re.sub(Lexer.re_new_line, '\n', s)  # normalize all line endings to '\n'
self.line += s.count('\n')
if len(s) > 0 and s[0] == '\n':
s = s[1:]
return s
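Long bracket strings keep their content verbatim except that line endings are normalized and a single leading newline is dropped, and every newline inside them still bumps the line counter. A short sketch (input made up for illustration):
lx = Lexer('[==[\nalo\n123"]==] rest', 'demo')
print(lx.get_next_token())  # line 1, STRING, 'alo\n123"'
print(lx.line)              # 3: both newlines inside the brackets were counted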
- number and identifier
def scan_identifier(self):
return self.scan(Lexer.re_identifier)
def scan_number(self):
return self.scan(Lexer.re_number)
def scan(self, pattern):
m = re.match(pattern, self.chunk)
if m:
token = m.group()
self.next(len(token))
return token
raise Exception('unreachable')
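scan_number hands everything to re_number, which also accepts hexadecimal constants with an optional fraction and binary exponent. A quick sketch (input made up for illustration):
lx = Lexer("0x3p-1 123.456e78 0xff.8", "demo")
print(lx.get_next_token())  # line 1, NUMBER, '0x3p-1'
print(lx.get_next_token())  # line 1, NUMBER, '123.456e78'
print(lx.get_next_token())  # line 1, NUMBER, '0xff.8'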
- test
print("hello") -- short comment
print("world") --> another short comment
print() --[[ long comment ]]
--[===[
another
long comment
]===]

print("hello, \z
world!") --> hello, world!
a = 'alo\n123"'
a = "alo\n123\""
a = '\97lo\10\04923"'
a = [[alo
123"]]
a = [==[
alo
123"]==]
a = 123.456e78
a = 'ação'
a = '\x66'
a = '\u{03a9}\u{03a8}'
a = [======aaa
- main
import sys
from lexer import Lexer
from lua_token import *
def main():
with open(sys.argv[1], 'r') as f:
data = f.read()
lexer = Lexer(data, sys.argv[1])
while True:
try:
line, kind, token = lexer.get_next_token()
print('[%2d] [%-10s] %s' % (line, kind_to_category(kind), token))
if kind == TokenKind.EOF:
break
except Exception as e:
sys.exit(e)
if __name__ == '__main__':
if len(sys.argv) == 2:
main()
else:
        print('error: expected a single Lua source file as the only argument')
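Assuming the listings above are saved as lua_token.py, lexer.py, and main.py (the first two names come from the driver's imports; main.py is just a guess for the driver's file name), running the driver on the test chunk shown earlier produces the output below:
python main.py ./test/lexer.lua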
- result
[ 1] [identifier] print
[ 1] [separator ] (
[ 1] [string ] hello
[ 1] [separator ] )
[ 2] [identifier] print
[ 2] [separator ] (
[ 2] [string ] world
[ 2] [separator ] )
[ 3] [identifier] print
[ 3] [separator ] (
[ 3] [separator ] )
[ 9] [identifier] print
[ 9] [separator ] (
[ 9] [string ] hello, world!
[10] [separator ] )
[12] [identifier] a
[12] [operator ] =
[12] [string ] alo
123"
[13] [identifier] a
[13] [operator ] =
[13] [string ] alo
123"
[14] [identifier] a
[14] [operator ] =
[14] [string ] alo
123"
[15] [identifier] a
[15] [operator ] =
[15] [string ] alo
123"
[17] [identifier] a
[17] [operator ] =
[17] [string ] alo
123"
[20] [identifier] a
[20] [operator ] =
[20] [number ] 123.456e78
[21] [identifier] a
[21] [operator ] =
[21] [string ] ação
[22] [identifier] a
[22] [operator ] =
[22] [string ] f
[23] [identifier] a
[23] [operator ] =
[23] [string ] ΩΨ
[24] [identifier] a
[24] [operator ] =
./test/lexer.lua:24: invalid long string delimiter near "[="