Lexical analyzer (Lexical Analyzer)] [implemented in Python

View text content

from __future__ import print_function
import sys
 
# following two must remain in the same order
 
tk_EOI, tk_Mul, tk_Div, tk_Mod, tk_Add, tk_Sub, tk_Negate, tk_Not, tk_Lss, tk_Leq, tk_Gtr, \
tk_Geq, tk_Eq, tk_Neq, tk_Assign, tk_And, tk_Or, tk_If, tk_Else, tk_While, tk_Print,       \
tk_Putc, tk_Lparen, tk_Rparen, tk_Lbrace, tk_Rbrace, tk_Semi, tk_Comma, tk_Ident,          \
tk_Integer, tk_String = range(31)
 
all_syms = ["End_of_input", "Op_multiply", "Op_divide", "Op_mod", "Op_add", "Op_subtract",
    "Op_negate", "Op_not", "Op_less", "Op_lessequal", "Op_greater", "Op_greaterequal",
    "Op_equal", "Op_notequal", "Op_assign", "Op_and", "Op_or", "Keyword_if",
    "Keyword_else", "Keyword_while", "Keyword_print", "Keyword_putc", "LeftParen",
    "RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma", "Identifier",
    "Integer", "String"]
 
# single character only symbols
symbols = { '{': tk_Lbrace, '}': tk_Rbrace, '(': tk_Lparen, ')': tk_Rparen, '+': tk_Add, '-': tk_Sub,
    '*': tk_Mul, '%': tk_Mod, ';': tk_Semi, ',': tk_Comma }
 
key_words = {'if': tk_If, 'else': tk_Else, 'print': tk_Print, 'putc': tk_Putc, 'while': tk_While}
 
the_ch = " "    # dummy first char - but it must be a space
the_col = 0
the_line = 1
input_file = None
 
#*** show error and exit
def error(line, col, msg):
    print(line, col, msg)
    exit(1)
 
#*** get the next character from the input
def next_ch():
    global the_ch, the_col, the_line
 
    the_ch = input_file.read(1)
    the_col += 1
    if the_ch == '\n':
        the_line += 1
        the_col = 0
    return the_ch
 
#*** 'x' - character constants
def char_lit(err_line, err_col):
    n = ord(next_ch())              # skip opening quote
    if the_ch == '\'':
        error(err_line, err_col, "empty character constant")
    elif the_ch == '\\':
        next_ch()
        if the_ch == 'n':
            n = 10
        elif the_ch == '\\':
            n = ord('\\')
        else:
            error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch))
    if next_ch() != '\'':
        error(err_line, err_col, "multi-character constant")
    next_ch()
    return tk_Integer, err_line, err_col, n
 
#*** process divide or comments
def div_or_cmt(err_line, err_col):
    if next_ch() != '*':
        return tk_Div, err_line, err_col
 
    # comment found
    next_ch()
    while True:
        if the_ch == '*':
            if next_ch() == '/':
                next_ch()
                return gettok()
        elif len(the_ch) == 0:
            error(err_line, err_col, "EOF in comment")
        else:
            next_ch()
 
#*** "string"
def string_lit(start, err_line, err_col):
    text = ""
 
    while next_ch() != start:
        if len(the_ch) == 0:
            error(err_line, err_col, "EOF while scanning string literal")
        if the_ch == '\n':
            error(err_line, err_col, "EOL while scanning string literal")
        text += the_ch
 
    next_ch()
    return tk_String, err_line, err_col, text
 
#*** handle identifiers and integers
def ident_or_int(err_line, err_col):
    is_number = True
    text = ""
 
    while the_ch.isalnum() or the_ch == '_':
        text += the_ch
        if not the_ch.isdigit():
            is_number = False
        next_ch()
 
    if len(text) == 0:
        error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
 
    if text[0].isdigit():
        if not is_number:
            error(err_line, err_col, "invalid number: %s" % (text))
        n = int(text)
        return tk_Integer, err_line, err_col, n
 
    if text in key_words:
        return key_words[text], err_line, err_col
 
    return tk_Ident, err_line, err_col, text
 
#*** look ahead for '>=', etc.
def follow(expect, ifyes, ifno, err_line, err_col):
    if next_ch() == expect:
        next_ch()
        return ifyes, err_line, err_col
 
    if ifno == tk_EOI:
        error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
 
    return ifno, err_line, err_col
 
#*** return the next token type
def gettok():
    while the_ch.isspace():
        next_ch()
 
    err_line = the_line
    err_col  = the_col
 
    if len(the_ch) == 0:    return tk_EOI, err_line, err_col
    elif the_ch == '/':     return div_or_cmt(err_line, err_col)
    elif the_ch == '\'':    return char_lit(err_line, err_col)
    elif the_ch == '<':     return follow('=', tk_Leq, tk_Lss,    err_line, err_col)
    elif the_ch == '>':     return follow('=', tk_Geq, tk_Gtr,    err_line, err_col)
    elif the_ch == '=':     return follow('=', tk_Eq,  tk_Assign, err_line, err_col)
    elif the_ch == '!':     return follow('=', tk_Neq, tk_Not,    err_line, err_col)
    elif the_ch == '&':     return follow('&', tk_And, tk_EOI,    err_line, err_col)
    elif the_ch == '|':     return follow('|', tk_Or,  tk_EOI,    err_line, err_col)
    elif the_ch == '"':     return string_lit(the_ch, err_line, err_col)
    elif the_ch in symbols:
        sym = symbols[the_ch]
        next_ch()
        return sym, err_line, err_col
    else: return ident_or_int(err_line, err_col)
 
#*** main driver
input_file = sys.stdin
if len(sys.argv) > 1:
    try:
        input_file = open(sys.argv[1], "r", 4096)
    except IOError as e:
        error(0, 0, "Can't open %s" % sys.argv[1])
 
while True:
    t = gettok()
    tok  = t[0]
    line = t[1]
    col  = t[2]
 
    print("%5d  %5d   %-14s" % (line, col, all_syms[tok]), end='')
 
    if tok == tk_Integer:  print("   %5d" % (t[3]))
    elif tok == tk_Ident:  print("  %s" %   (t[3]))
    elif tok == tk_String: print('  "%s"' % (t[3]))
    else:                  print("")
 
    if tok == tk_EOI:
        break

Output (test III)

    5     16   Keyword_print
    5     40   Op_subtract
    6     16   Keyword_putc
    6     40   Op_less
    7     16   Keyword_if
    7     40   Op_greater
    8     16   Keyword_else
    8     40   Op_lessequal
    9     16   Keyword_while
    9     40   Op_greaterequal
   10     16   LeftBrace
   10     40   Op_equal
   11     16   RightBrace
   11     40   Op_notequal
   12     16   LeftParen
   12     40   Op_and
   13     16   RightParen
   13     40   Op_or
   14     16   Op_subtract
   14     40   Semicolon
   15     16   Op_not
   15     40   Comma
   16     16   Op_multiply
   16     40   Op_assign
   17     16   Op_divide
   17     40   Integer             42
   18     16   Op_mod
   18     40   String          "String literal"
   19     16   Op_add
   19     40   Identifier      variable_name
   20     26   Integer             10
   21     26   Integer             92
   22     26   Integer             32
   23      1   End_of_input
Published 56 original articles · won praise 166 · views 60000 +

Guess you like

Origin blog.csdn.net/qq_36721220/article/details/103566791