Verbatim https://www.dazhuanlan.com/2019/08/25/5d62598fd13ee/
For my compiler theory course project, I needed to design a lexical analysis program, so I tried implementing a simple lexical analyzer for the C language in Python. There are no particular difficulties to overcome: as long as the DFA for the lexical analysis is designed well, the implementation approach becomes much clearer.
1 Introduction
For the C language, we need to extract keywords, identifiers, delimiters, operators, and constants of different data types. Of these, keywords, identifiers, and delimiters are relatively simple to extract. Operators need a detailed breakdown, because different combinations of operator characters form operators with different meanings, for example: >>, >=, >>=. Constants are the most troublesome, for example: integers such as octal 00123u and hexadecimal 0x23fflu, floats such as 123.434f, and exponent forms such as 0123e-002.
The lexical analyzer must be able to detect errors, report the line number and position of each error, and skip the erroneous text so that the analysis can continue. In addition, single-line and multi-line comments need to be skipped, as do macro definitions and preprocessor directives, which the analyzer cannot process for now.
2. Partial implementation of the finite state machine
2.1 Keywords and identifiers
if prog [flag] .isalpha () or prog [flag] == r '_': #_ identifiers beginning with a letter key and tmp = In Flag the while PROG [In Flag] .isalnum () or PROG [In Flag] R & lt == '_':. 1 in Flag = + # rightmost come Matching the fragment = STR (the reduce (the lambda X, Y: X + Y, PROG [tmp: in Flag])) IF keyword in the fragment: # determines a keyword, keyword keywords List Print ( "<" the fragment + + ", ->", End = '') the else: # determines identifier print ( "<IDENTIFY," + fragment + ">", end = '')
2.2 Numbers (the most complex part)
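# prog is the string being analyzed; flag marks the current position in it; line is the current line number; befspace is the position of the previous line feed (so flag - befspace is the column within the current line)
# skip(prog, flag) skips over the invalid characters after an error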
elif prog [flag] .isdigit () : # numeric constants determined
if prog [flag] == '0 ': # when the start character is 0
IF PROG [In Flag +. 1] .lower () == 'X': # 0x beginning hexadecimal
In Flag + 2 =
tmp = In Flag
the while PROG [In Flag] .isdigit () or
(PROG [In Flag] .lower ()> = 'A' and PROG [In Flag] .lower () <= 'F' ): # Analyzing hex
In Flag + =. 1
numch = STR (the reduce (the lambda X, Y: X + Y, PROG [tmp: In Flag])) # splicing hexadecimal number
num = str (int (numch, 16 )) # spliced to a decimal number
if prog [flag] .lower () == 'u': # determines whether a signed long integer, after the current point to hexadecimal a flag
if prog [flag + 1] .lower () == 'L': # 0x23ul
IF PROG [In Flag + 2] .isalnum (): # 0x23uls error, and skip
print ( "<ERROR, INT_UN_LONG_DEFINE, LINE" + str (line) + "" + str ( befspace-In Flag) + ">", End = '')
In Flag + =. 3
In Flag = Skip (PROG, In Flag)
the else:
Print ( "<INT_UN_LONG," NUM + + ">", End = '')
flag+=2
elif prog[flag+1].isalnum():#0x23u2出错
print(“<ERROR,INT_UN_DEFINE,LINE “+str(line)+” “+str(flag-befspace)+” >”, end=’ ‘)
flag += 1
flag=skip(prog,flag)
else:
print(“< INT_UN,”+num+” >”,end=’ ‘)#
flag+=1
elif prog[flag].lower()==’l’:#同上
if prog[flag+1].lower()==’u’:
if prog[flag + 2].isalnum():
print(“< ERROR,INT_UN_LONG_DEFINE ,LINE “+str(line)+” “+str(flag-befspace)+” >”, end=’ ‘)
flag += 3
flag = skip(prog, flag)
else:
print(“< INT_UN_LONG,”+num+” >”,end=’ ‘)
flag+=2
elif prog[flag+1].isalnum():
print(“< ERROR,INT_LONG_DEFINE,LINE “+str(line)+str(flag-befspace)+” >”, end=’ ‘)
+ =. 1 In Flag
In Flag = Skip (PROG, In Flag)
the else:
Print ( "<INT_LONG," NUM + + ">", End = '')
In Flag + =. 1
elif PROG [In Flag] .isalpha (): # Hex after receiving the character is given
Print ( "<ERROR, INT_DEFINE, the LINE" + STR (Line) + "" + STR (In Flag-befspace) + ">", End = '')
In Flag + =. 1
In Flag = Skip (PROG, In Flag )
the else:
Print ( "<the INT," NUM + + ">", End = '')
Pass
elif PROG [In Flag +. 1] == '':.. # 0 at the beginning of the floating-point
tmp = In Flag
In Flag = 2 +
haspro = False # whether there are +, -, E
hasneg = False
Hasee = False
the while PROG [In Flag] .isdigit () or PROG [In Flag] .lower () == 'E' or PROG [In Flag] == '-' or PROG [In Flag] == '+':
IF (PROG [In Flag] == '+' or PROG [In Flag] == '-') and Not Hasee: # must meet a number of conditions BREAK
elif (prog[flag] == ‘+’ or prog[flag] == ‘-‘) and (haspro or hasneg):break
elif (prog[flag]==’+’ or prog[flag]==’-‘) and haseE:
if prog[flag-1].lower()!=’e’:break
else:
if prog[flag]==’+’:haspro=True
else:hasneg=True
elif prog[flag].lower()==’e’ and haseE:break
elif prog[flag]==’+’:haspro=True
elif prog[flag] ==’-‘:hasneg = True
elif prog[flag].lower()==’e’:haseE=True
flag+=1
befpron=str(reduce(lambda x,y:x+y,prog[tmp:flag]))#拼接
findpro=befpron.find(‘+’)
findneg=befpron.find(‘-‘)
findeE=befpron.lower().find(‘e’)
IsFLOAT=False
IsERROR=False
theskip=flag
if prog[flag].lower()==’f’:#0.23e+002f浮点数声明判断
if prog[flag+1].isalnum():#0.23e+002f3报错
print(“< ERROR,FLOAT_DEFINE,LINE “ + str(line) + “ “ + str(flag+1- befspace) + “ >”,
end=’ ‘)
IsERROR=True
flag = skip(prog, flag+2)
else:
IsFLOAT=True
theskip=flag+1
elif prog[flag].isalnum():#同上
print(“< ERROR,FLOAT_DEFINE,LINE “ + str(line) + “ “ + str(flag-befspace) + “ >”,
end=’ ‘)
IsERROR = True
flag = skip(prog, flag+1)
if not IsERROR:
if (hasneg or haspro) and haseE:#有符号指数型
pos=(findpro if haspro>hasneg else findneg)
if pos!=flag-tmp-1:#处理e+23
suffix=int(reduce(lambda x,y:str(int(x)10+int(y)),befpron[pos+1:]))
suffix=suffix if haspro else -suffix
num=str(float(str(reduce(lambda x,y:x+y,befpron[0:findeE])))10**suffix)
if IsFLOAT:
print(“< FLOAT,” + num + “ >”, end=’ ‘)
flag=theskip
else:print(“< DOUBLE,” + num + “ >”, end=’ ‘)
else:#0.12e+报错
print(“< ERROR,FLOAT_DEFINE,LINE “ + str(line) + “ “ + str(flag - befspace) + “ >”,
end=’ ‘)
flag = skip(prog, flag)
elif haseE and not (haspro or hasneg):#无符号指数e
if findeE!=flag-tmp-1:
suffix = int(str(reduce(lambda x, y: x + y, befpron[findeE+1:])))
num = str(float(str(reduce(lambda x, y: x + y, befpron[0:findeE])))
* 10 ** suffix)
if IsFLOAT:
print("< FLOAT," + num + " >", end=' ')
flag=theskip
else:print("< DOUBLE," + num + " >", end=' ')
else:
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=' ')
flag = skip(prog, flag)
pass
else:#单纯的浮点数0.23
num=str(float(befpron))
if IsFLOAT:
print("< FLOAT," + num + " >", end=' ')
flag = theskip
else:
print("< DOUBLE," + num + " >", end=' ')
elif prog[flag+1].isdigit():#浮点数和八进制均有可能
tmp=flag
flag+=2
haspoint=False#是否有小数点、符号、指数
haseE=False
haspro=False
hasneg=False
while prog[flag].isdigit() or prog[flag].lower()=='e' or prog[flag]=='-' or prog[flag]=='+' or prog[flag]=='.':
if (prog[flag]=='+' or prog[flag]=='-') and not haseE:break#需满足的一些条件
elif (prog[flag] == '+' or prog[flag] == '-') and (haspro or hasneg):
break
elif (prog[flag]=='+' or prog[flag]=='-') and haseE:
if prog[flag-1].lower()!='e':break
else:
if prog[flag]=='+':haspro=True
else:hasneg=True
elif (haseE or haspoint) and prog[flag]=='.':break
elif prog[flag].lower() == 'e' and haseE:break
elif prog[flag]=='.' and haspoint:break
elif prog[flag] == '+' :haspro = True
elif prog[flag] == '-':hasneg = True
elif prog[flag].lower() == 'e' :haseE = True
elif prog[flag]=='.' :haspoint=True
flag+=1
befpron = str(reduce(lambda x, y: x + y, prog[tmp:flag]))#拼接
findpro = befpron.find('+')
findneg = befpron.find('-')
findeE = befpron.lower().find('e')
findpoint = befpron.find('.')
IsFLOAT = False
IsERROR = False
theskip = flag
if (haspoint or haseE) and prog[flag].lower() == 'f':#同上
if prog[flag + 1].isalnum():
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag + 1 - befspace) + " >",
end=' ')
IsERROR = True
flag = skip(prog, flag + 2)
else:
IsFLOAT = True
theskip = flag + 1
elif prog[flag].isalnum():
print("< ERROR,NUM_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=' ')
IsERROR = True
flag = skip(prog, flag + 1)
if not IsERROR:
if haseE and (haspro or hasneg):
pos = (findpro if haspro > hasneg else findneg)
if pos != flag - tmp - 1: # 处理e+23
suffix = int(reduce(lambda x, y: str(int(x) * 10 + int(y)), befpron[pos + 1:]))
suffix = suffix if haspro else -suffix
num = str(float(str(reduce(lambda x, y: x + y, befpron[0:findeE]))) * 10 ** suffix)
if IsFLOAT:
print("< FLOAT," + num + " >", end=' ')
flag=theskip
else:print("< DOUBLE," + num + " >", end=' ')
else: # 0.12e+报错
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=' ')
flag = skip(prog, flag)
elif haseE :
if haseE!=flag-tmp-1:
suffix = int(str(reduce(lambda x, y: x + y, befpron[findeE + 1:])))
num=str(float(str(reduce(lambda x, y: x + y, befpron[0:findeE])))*10**suffix)
if IsFLOAT:
print("< FLOAT," + num + " >", end=' ')
flag=theskip
else:print("< DOUBLE," + num + " >", end=' ')
else:
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=' ')
flag = skip(prog, flag)
elif haspoint:
if IsFLOAT:
print("< FLOAT," + str(float(befpron)) + " >", end=' ')
flag = theskip
else:
print("< DOUBLE," + str(float(befpron)) + " >", end=' ')
else:
isoctal=True
for char in befpron:
if int(char)>=8:isoctal=False
if not isoctal:
print("< ERROR,OCTAL_OUTOFRANG,LINE"+str(line)+" "+str(flag-befspace)+" >",end=' ')
else:
print("< INT,"+str(int(befpron,8))+">" ,end=' ')
elif prog[flag+1] in operetor or prog[flag+1] in delimiter:#简单的0
print("< INT,0 >",end=' ')
flag+=1
else:#其他字母或符号抛异常
flag+=2
print("< ERROR,NUM_DEFINE,LINE"+str(line)+" "+str(flag-befspace)+" >",end=' ')
else:#非0开头可为浮点数或整数
haspoint=False#同上
haseE=False
haspro = False
hasneg = False
tmp=flag
while prog[flag].isdigit() or prog[flag].lower() == 'e' or prog[flag] == '-' or prog[flag] == '+' or
prog[flag] == '.':
if (prog[flag] == '+' or prog[flag] == '-') and not haseE:break
elif (prog[flag] == '+' or prog[flag] == '-') and (haspro or hasneg):break
elif (prog[flag] == '+' or prog[flag] == '-') and haseE:
if prog[flag - 1].lower() != 'e':
break
else:
if prog[flag] == '+':
haspro = True
else:
hasneg = True
elif (haseE or haspoint) and prog[flag] == '.':break
elif prog[flag].lower() == 'e' and haseE:break
elif prog[flag] == '.' and haspoint:break
elif prog[flag] == '+':
haspro = True
elif prog[flag] == '-':
hasneg = True
elif prog[flag].lower() == 'e':
haseE = True
elif prog[flag] == '.':
haspoint = True
flag += 1
befpron = str(reduce(lambda x, y: x + y, prog[tmp:flag]))
findpro = befpron.find('+')
findneg = befpron.find('-')
findeE = befpron.lower().find('e')
findpoint = befpron.find('.')
IsFLOAT = False
IsERROR = False
theskip = flag
if (haspoint or haseE) and prog[flag].lower() == 'f':
if prog[flag + 1].isalnum():
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag + 1 - befspace) + " >",
end=' ')
IsERROR = True
flag = skip(prog, flag + 2)
else:
IsFLOAT = True
theskip = flag + 1
elif prog[flag].isalnum():
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=' ')
IsERROR = True
flag = skip(prog, flag + 1)
if not IsERROR:
if haseE and (haspro or hasneg):#有符号指数型
pos = (findpro if haspro > hasneg else findneg)
if pos != flag - tmp - 1: # 处理e+23
suffix = int(reduce(lambda x, y: str(int(x) * 10 + int(y)), befpron[pos + 1:]))
suffix = suffix if haspro else -suffix
num = str(float(str(reduce(lambda x, y: x + y, befpron[0:findeE]))) * 10 ** suffix)
if IsFLOAT:
print("< FLOAT," + num + " >", end=' ')
flag = theskip
else:
print("< DOUBLE," + num + " >", end=' ')
else: # 0.12e+报错
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=' ')
flag = skip(prog, flag)
elif haseE :#指数型浮点数
if haseE != flag - tmp - 1:
suffix = int(str(reduce(lambda x, y: x + y, befpron[findeE + 1:])))
num = str(float(str(reduce(lambda x, y: x + y, befpron[0:findeE]))) * 10 ** suffix)
if IsFLOAT:
print("< FLOAT," + num + " >", end=' ')
flag = theskip
else:
print("< DOUBLE," + num + " >", end=' ')
else:
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=' ')
flag = skip(prog, flag)
elif haspoint:#小数点浮点数
if IsFLOAT:
print("< FLOAT," + str(float(befpron)) + " >", end=' ')
flag = theskip
else:
print("< DOUBLE," + str(float(befpron)) + " >", end=' ')
else:#纯整数
num = str(int(befpron))
if prog[flag].lower() == 'u':
if prog[flag + 1].lower() == 'l':
if prog[flag + 2].isalnum():
print("<ERROR,INT_UN_LONG_DEFINE,LINE"+str(line)+" "+str(flag-befspace)+" >", end=' ')
flag += 3
flag = skip(prog, flag)
else:
print("<INT_UN_LONG," + num + ">", end=' ')
flag += 2
elif prog[flag + 1].isalnum():
print("< ERROR,INT_LONG_DEFINE ,LINE"+str(line)+" "+str(flag-befspace)+" >", end=' ')
flag += 1
flag = skip(prog, flag)
else:
print("<INT_UN," + num + ">", end=' ')
elif prog[flag].lower() == 'l':
if prog[flag + 1].lower() == 'u':
if prog[flag + 2].isalnum():
print("< ERROR,INT_UN_LONG_DEFINE ,LINE"+str(line)+" "+str(flag-befspace)+" >", end=' ')
flag += 3
flag = skip(prog, flag)
else:
print("< INT_UN_LONG," + num + ">", end=' ')
flag += 2
elif prog[flag + 1].isalnum():
print("< ERROR,INT_LONG_DEFINE ,LINE"+str(line)+" "+str(flag-befspace)+" >", end=' ')
flag += 1
flag = skip(prog, flag)
else:
print("< INT_LONG," + num + " >", end=' ')
elif prog[flag].isalpha():
print("< ERROR,INT_DEFINE,LINE"+str(line)+" "+str(flag-befspace)+" >", end=' ')
flag = skip(prog, flag)
else:
print("< INT," + num + " >", end=' ')
Some classmates wanted to take advantage of Python's regular expressions instead, writing a pattern for each of the token classes described above and matching them directly. However, slightly more complicated structures cannot be expressed that way, the approach is prone to false matches, and it cannot report the location of an error.
Admittedly, the complete code contains a lot of redundancy. This is just my own implementation, and criticism is welcome: https://github.com/single-wolf/show-me-the-code/blob/master/analyzer.py
I have also come to like Python recently; here is an interesting set of practice exercises to share: https://github.com/Show-Me-the-Code/show-me-the-code