Compilation Principle Experiment 1: Design and Implementation of Source Program Preprocessing and Lexical Analysis Program (python)

Purpose

Design and implement a lexical analysis program including preprocessing functions to deepen the understanding of the lexical analysis process in compilation.

Experimental requirements

1. Realize the preprocessing function

The source program may contain symbols that are meaningless to program execution, and it is required to remove them.
First, write an input routine for the source program that reads several lines of statements from the keyboard, a file or a text box and stores them in the input buffer (character data) in sequence; then write a preprocessing subroutine that removes editing characters such as carriage returns, line breaks and tabs, merges multiple blanks into one, and removes comments.

2. Realize the lexical analysis function

Input: The source program string for the given grammar.
Output: A sequence of 2-tuples (syn, token or sum). Wherein,
syn is the word classification code.
Token is the character string of the stored word itself.
Sum is an integer constant.
During specific implementation, the two-tuple of words can be processed with a structure.

3. Morphology of the C language subset to be analyzed (it can be expanded by itself, or can be defined according to the lexical definition of C language)

1) Keywords
main if then while do static int double struct break else long switch case typedef char return const float short continue for void default sizeof do
All keywords are lowercase.
2) Operators and delimiters
+ - * / : := < <> <= > >= = ; ( ) #
3) Other tags ID and NUM
define other tags through the following formal forms:
ID→letter(letter|digit)*
NUM→digit digit*
letter→a|…|z|A|…|Z
digit→0|…|9
4) Spaces are composed of blanks, tabs and newlines.
Spaces are generally used to separate IDs, NUMs, special symbols and keywords; they are usually ignored during the lexical analysis phase.

4. The category codes corresponding to various word symbols

There are some errors in the category code provided by the title. The actual requirement is that it can be defined by yourself. Here is a reference:

# Category code (syn) of every word the lexer looks up by name.
# 'ID' and 'NUM' are category names rather than lexemes: their values are the
# codes for identifiers (25) and integer constants (26).
keyword = {
    'main': 1, 'if': 2, 'then': 3, 'while': 4, 'do': 5,
    'static': 6, 'int': 7, 'double': 8, 'struct': 9,
    'break': 10, 'else': 11, 'long': 12, 'switch': 13,
    'case': 14, 'typedef': 15, 'char': 16, 'return': 17,
    'const': 18, 'float': 19, 'short': 20, 'continue': 21,
    'for': 22, 'void': 23,
    'ID': 25, 'NUM': 26,
    'default': 39, 'sizeof': 24,
    # Additions beyond the keyword list given in the assignment.
    'stdio.h': 40, 'include': 44, 'scanf': 48, 'printf': 49,
}

# Category code (syn) of every operator and delimiter.
operator = {
    '+': 27, '-': 28, '*': 29, '/': 30,
    ':': 31, ':=': 32,
    '<': 33, '<>': 34, '<=': 35, '>': 36, '>=': 37, '=': 38,
    ';': 41, '(': 42, ')': 43,
    '#': 0,
    '{': 46, '}': 47,
}

5. The main algorithm idea of the lexical analysis program

The basic task of the algorithm is to identify the word symbol with independent meaning from the source program represented by the string, and its basic idea is to spell out the corresponding word symbol according to the type of the first character of the scanned word symbol.

This program reads the data to be analyzed from test.txt

code:

import re
def pre(file):
    """Preprocess source text read from an open text file.

    Removes ``/* ... */`` block comments and ``// ...`` line comments,
    collapses every run of whitespace (blanks, tabs, carriage returns) into
    a single blank, and drops lines that end up empty.

    Args:
        file: Object with a ``read()`` method returning the whole source
            text as ``str``.

    Returns:
        list[str]: The non-empty, normalised lines in their original order.

    Note:
        Comment markers inside string literals are not recognised, so text
        such as ``"http://x"`` is cut at the ``//``.
    """
    data = file.read()
    # Raw strings keep the regex escapes (\*, \s) away from Python's own
    # string-escape processing. A block comment becomes one blank so the
    # tokens on either side of it are not glued together.
    data = re.sub(r'/\*{1,2}[\s\S]*?\*/', " ", data)
    # Stop at the end of the line without consuming the newline: the next
    # line stays separate, and a comment on the last line is removed even
    # when the text has no trailing newline.
    data = re.sub(r'//[^\n]*', "", data)
    out = []
    for line in data.split("\n"):
        # split() with no argument treats tabs like blanks, so tab-separated
        # tokens stay apart instead of being merged into one word.
        words = line.split()
        if words:
            out.append(' '.join(words))
    return out
# Category code (syn) of every word the lexer looks up by name.
# 'ID' and 'NUM' are category names rather than lexemes: their values are the
# codes for identifiers (25) and integer constants (26).
keyword = {
    'main': 1, 'if': 2, 'then': 3, 'while': 4, 'do': 5,
    'static': 6, 'int': 7, 'double': 8, 'struct': 9,
    'break': 10, 'else': 11, 'long': 12, 'switch': 13,
    'case': 14, 'typedef': 15, 'char': 16, 'return': 17,
    'const': 18, 'float': 19, 'short': 20, 'continue': 21,
    'for': 22, 'void': 23,
    'ID': 25, 'NUM': 26,
    'default': 39, 'sizeof': 24,
    # Additions beyond the keyword list given in the assignment.
    'stdio.h': 40, 'include': 44, 'scanf': 48, 'printf': 49,
}

# Category code (syn) of every operator and delimiter.
operator = {
    '+': 27, '-': 28, '*': 29, '/': 30,
    ':': 31, ':=': 32,
    '<': 33, '<>': 34, '<=': 35, '>': 36, '>=': 37, '=': 38,
    ';': 41, '(': 42, ')': 43,
    '#': 0,
    '{': 46, '}': 47,
}
# Lex test.txt one preprocessed line at a time. For each line the words
# (keywords, numbers, identifiers) are printed first, then the operators
# that remain once the words have been blanked out.
with open('test.txt', 'r') as file:
    data = pre(file)
    word_re = re.compile('[a-zA-Z.0-9]+')
    for text in data:
        for word in word_re.findall(text):
            if word in keyword:
                shown, code = word, keyword[word]
            elif word.isdigit():
                shown, code = "'" + word + "'", 26
            else:
                # The pattern also captures '.', e.g. ".counts"; drop the dots.
                shown, code = "'" + word.strip('.') + "'", 25
            print(shown + ' -> ' + str(code))
        # Only chunks that are exactly one table entry are reported.
        for chunk in word_re.sub(" ", text).split(" "):
            if chunk in operator:
                print(chunk + ' -> ' + str(operator[chunk]))

Test text:

#include<stdio.h>
struct student
{
    
    
    int id;
    long int counts;
    /*

asfagsaf

    */
    /* data */
};
student stu[2000000];
int main(){
    
    
    for(long int i=0;i<2000000;i++){
    
    
        stu[i].id=i;
        stu[i].counts=0;
    }
    long int n,m;
    int a;
    scanf("%d",&n);
    for(long int i=0;i<n;i++){
    
    
        scanf("%ld",&a);
        stu[a].counts++;
    }
    scanf("%ld",&m);
    for(long int i=0;i<m;i++){
    
    
        scanf("%d",&a);
        if(stu[a].counts==0){
    
    
            printf("NO\n");
        }
        else{
    
    
            printf("YES\n");
        }
    }
    return 0;
}

output:

include -> 44
stdio.h -> 40
# -> 0
< -> 33
> -> 36
struct -> 9
'student' -> 25
{ -> 46
int -> 7
'id' -> 25
; -> 41
long -> 12
int -> 7
'counts' -> 25
; -> 41
'student' -> 25
'stu' -> 25
'2000000' -> 26
int -> 7
main -> 1
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'2000000' -> 26
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
'stu' -> 25
'i' -> 25
'id' -> 25
'i' -> 25
= -> 38
; -> 41
'stu' -> 25
'i' -> 25
'counts' -> 25
'0' -> 26
= -> 38
; -> 41
} -> 47
long -> 12
int -> 7
'n' -> 25
'm' -> 25
; -> 41
int -> 7
'a' -> 25
; -> 41
scanf -> 48
'd' -> 25
'n' -> 25
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'n' -> 25
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
scanf -> 48
'ld' -> 25
'a' -> 25
'stu' -> 25
'a' -> 25
'counts' -> 25
} -> 47
scanf -> 48
'ld' -> 25
'm' -> 25
for -> 22
long -> 12
int -> 7
'i' -> 25
'0' -> 26
'i' -> 25
'm' -> 25
'i' -> 25
( -> 42
= -> 38
; -> 41
< -> 33
; -> 41
scanf -> 48
'd' -> 25
'a' -> 25
if -> 2
'stu' -> 25
'a' -> 25
'counts' -> 25
'0' -> 26
( -> 42
printf -> 49
'NO' -> 25
'n' -> 25
} -> 47
else -> 11
{ -> 46
printf -> 49
'YES' -> 25
'n' -> 25
} -> 47
} -> 47
return -> 17
'0' -> 26
; -> 41
} -> 47

insert image description here

Points to note:

  1. After preprocessing the file, words and symbols are extracted from the data with regular expressions. Because the pattern [a-zA-Z.0-9]+ also matches '.', there is a small bug: in a statement such as stu[i].counts=0; the '.' is merged into the following word, giving .counts. The workaround is to strip the '.' when printing.
  2. The regular expressions used in preprocessing: /\*{1,2}[\s\S]*?\*/ removes multi-line comments, and //[\s\S]*?\n removes single-line comments.
  3. The current output is not in source order: for each line all the words are printed first, then the symbols. This should be improved.

correct:

In the later period, some supplements and improvements were made to the intermediate process, so that the output sequence of the analysis was carried out in sequence.

import re
def pre(file):
    """Preprocess source text read from an open text file.

    Removes ``/* ... */`` block comments and ``// ...`` line comments,
    collapses every run of whitespace (blanks, tabs, carriage returns) into
    a single blank, and drops lines that end up empty.

    Args:
        file: Object with a ``read()`` method returning the whole source
            text as ``str``.

    Returns:
        list[str]: The non-empty, normalised lines in their original order.

    Note:
        Comment markers inside string literals are not recognised, so text
        such as ``"http://x"`` is cut at the ``//``.
    """
    data = file.read()
    # Raw strings keep the regex escapes (\*, \s) away from Python's own
    # string-escape processing. A block comment becomes one blank so the
    # tokens on either side of it are not glued together.
    data = re.sub(r'/\*{1,2}[\s\S]*?\*/', " ", data)
    # Stop at the end of the line without consuming the newline: the next
    # line stays separate, and a comment on the last line is removed even
    # when the text has no trailing newline.
    data = re.sub(r'//[^\n]*', "", data)
    out = []
    for line in data.split("\n"):
        # split() with no argument treats tabs like blanks, so tab-separated
        # tokens stay apart instead of being merged into one word.
        words = line.split()
        if words:
            out.append(' '.join(words))
    return out
# Category code (syn) of every word the lexer looks up by name.
# 'ID' and 'NUM' are category names rather than lexemes: their values are the
# codes for identifiers (25) and integer constants (26).
keyword = {
    'main': 1, 'if': 2, 'then': 3, 'while': 4, 'do': 5,
    'static': 6, 'int': 7, 'double': 8, 'struct': 9,
    'break': 10, 'else': 11, 'long': 12, 'switch': 13,
    'case': 14, 'typedef': 15, 'char': 16, 'return': 17,
    'const': 18, 'float': 19, 'short': 20, 'continue': 21,
    'for': 22, 'void': 23,
    'ID': 25, 'NUM': 26,
    'default': 39, 'sizeof': 24,
    # Additions beyond the keyword list given in the assignment.
    'stdio.h': 40, 'include': 44, 'scanf': 48, 'printf': 49,
}

# Category code (syn) of every operator and delimiter.
operator = {
    '+': 27, '-': 28, '*': 29, '/': 30,
    ':': 31, ':=': 32,
    '<': 33, '<>': 34, '<=': 35, '>': 36, '>=': 37, '=': 38,
    ';': 41, '(': 42, ')': 43,
    '#': 0,
    '{': 46, '}': 47,
}
# A word is a maximal run of letters, digits and dots. The capturing group
# makes split() keep the words, so the resulting list alternates: text
# between words at even indices, words at odd indices.
_WORD = re.compile(r"([a-zA-Z.0-9]+)")
# Lexemes inside a word: NUM -> digit digit*, ID -> letter (letter | digit)*.
_LEXEME = re.compile(r"[0-9]+|[a-zA-Z][a-zA-Z0-9]*")
# 'ID' and 'NUM' in the keyword table are category names, not reserved words.
_CATEGORY_NAMES = ('ID', 'NUM')


def _is_reserved(word):
    """Return True when ``word`` is a real entry of the keyword table."""
    return word in keyword and word not in _CATEGORY_NAMES


def _scan_word(word):
    """Return the (syn, token) pairs contained in one word, left to right."""
    if _is_reserved(word):
        # Whole-word lookup first so dotted entries such as 'stdio.h' match.
        return [(keyword[word], word)]
    pairs = []
    # Dots only separate lexemes (".counts", "s.id"); they are not tokens.
    for lexeme in _LEXEME.findall(word):
        if _is_reserved(lexeme):
            pairs.append((keyword[lexeme], lexeme))
        elif lexeme.isdigit():
            pairs.append((keyword['NUM'], lexeme))
        else:
            pairs.append((keyword['ID'], lexeme))
    return pairs


def _scan_symbols(text):
    """Return the operator/delimiter pairs found in the text between words."""
    pairs = []
    pos = 0
    while pos < len(text):
        # Longest match first, so '<=' is one token rather than '<' and '='.
        for width in (2, 1):
            candidate = text[pos:pos + width]
            if candidate in operator:
                pairs.append((operator[candidate], candidate))
                pos += len(candidate)
                break
        else:
            pos += 1  # blank, or a symbol that is not in the operator table
    return pairs


def _scan_line(text):
    """Return every (syn, token) pair of one preprocessed line, in source order."""
    pieces = _WORD.split(text)
    pairs = []
    skip_word = False
    for pos, piece in enumerate(pieces):
        if pos % 2:
            if skip_word:
                skip_word = False
            else:
                pairs.extend(_scan_word(piece))
        elif piece == '\\' and pos + 1 < len(pieces) and pieces[pos + 1] == 'n':
            # The 'n' of a "\n" escape is not an identifier. Working from the
            # current position (not a search for the first backslash) handles
            # repeated escapes and a backslash at the very end of the line.
            skip_word = True
        else:
            pairs.extend(_scan_symbols(piece))
    return pairs


with open('test2.txt', 'r') as file:
    data = pre(file)
print("\n文本预处理后的内容:\n")
print(data)
print()
for text in data:
    for syn, token in _scan_line(text):
        if syn in (keyword['ID'], keyword['NUM']):
            # Identifiers and numbers are printed with their text quoted.
            print("(" + str(syn) + ",'" + token + "')")
        else:
            print('(' + str(syn) + ',' + token + ')')
The main operation is to change the use of regular expressions. Originally, it used expressions to extract, but now it uses expressions to split strings, so the effect is better.
Output:
sample1:
insert image description here
sample2:
insert image description here

Guess you like

Origin blog.csdn.net/qq_51594676/article/details/127983533