C language lexical analyzer (C++ language implementation)

This was a lab assignment for a compiler principles course. It took me nearly a week after class to write the code (mainly because I had not used C++ for a long time, many functions were unfamiliar, and I had to look up a lot of material). The analyzer does not detect syntax errors; if you want to add features, you can add them in the relevant functions. The token type codes are only a simple classification, and you can extend them yourself.

The lexical analysis process for the C language is shown in the figure below:

Insert picture description here

The code is as follows. This code is an exercise in applying what I have recently learned about clean code, especially variable naming and function writing. If you think it is well written, you can check out my related articles on clean code.

There are almost no comments in the code this time, but I believe everyone can understand it with the help of the flowchart. If you think it is too long, you can copy the code into Visual Studio and collapse the function bodies so that the logic is easier to follow.

#include <ctype.h>

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
using namespace std;
// Forward declarations of the functions defined further down in this file.

// Loading the input file and filtering it (comments, control characters).
string readFile(string fileName);
string fileFilter();
string singleLineCommentsFilter();
string multilineCommmentsFileter();
string specialCharacterFilter();
void separateAndJudge();

// Tokenizing, character classification and reporting.
bool isReservedWord(string vocabulary);
void separateAndJudge();  // NOTE: repeats the declaration above; a harmless redeclaration
void showTokenData();
int digitStarted(int cnt);
bool isBoundSymbol(char ch);
bool isOperator(char ch);
int judgeStartingCharactorType(char ch);
bool isDigit(char ch);
bool isAlpha(char ch);
int alphaStarted(int cnt);
int underlineStarted(int cnt);
string transCharToString(char ch);

// Text of the program being analysed. main() fills it from the input file and
// the filter functions modify it in place.
string codeSource;

// Token text -> type code. Because this is a map keyed by the token text, a
// token that occurs several times in the input is stored only once.
// identifier is 1, reservedWord 2, digit 3, borderSymbol 4, operator 5
map<string, int> tokens;
/**
 * Entry point: loads a C source file, strips comments and control characters,
 * tokenizes what is left and prints the collected token table.
 *
 * The file to analyse may be passed as the first command-line argument. When
 * no argument is given, the default path h:\testCode.txt is used.
 *
 * @return 0 on success (failures terminate the process inside the helpers).
 */
int main(int argc, char* argv[])
{
	const string sourcePath = (argc > 1) ? argv[1] : "h:\\testCode.txt";

	codeSource = readFile(sourcePath);
	cout << "This is source code" << endl << "---------------------------------" << endl << endl;
	cout << codeSource << endl;

	cout << "This is code filtered" << endl << "---------------------------------" << endl;
	// A single pass is sufficient: fileFilter() runs every filter over the whole text.
	// A second pass would re-scan text whose line breaks are already gone, so a stray
	// "//" would no longer have a line end to stop at.
	codeSource = fileFilter();
	cout << codeSource << endl;

	separateAndJudge();

	cout << "this is tokens " << endl;
	showTokenData();
	return 0;
}


/**
 * Reads a whole file into a string.
 *
 * @param fileName  path of the file to open (opened in text mode, "r").
 * @return          every character of the file, in order.
 *
 * If the file cannot be opened, prints "cant open file" and terminates the
 * process with a failure status.
 */
string readFile(string fileName)
{
	FILE* fp = fopen(fileName.c_str(), "r");
	if (fp == NULL)
	{
		cout << "cant open file";
		exit(EXIT_FAILURE);
	}

	string content;
	// fgetc() returns an int so that EOF can be told apart from every valid
	// byte; storing the result in a char would make byte 0xFF look like EOF.
	int ch;
	while ((ch = fgetc(fp)) != EOF)
	{
		content += static_cast<char>(ch);
	}

	fclose(fp);
	return content;
}
/**
 * Runs the three filters, in order, over the global codeSource:
 * single-line comments, multi-line comments, then special characters.
 *
 * Each filter edits codeSource in place and returns a copy of it, so only the
 * value returned by the last filter needs to be handed back.
 *
 * @return the filtered text.
 */
string fileFilter()
{
	singleLineCommentsFilter();
	multilineCommmentsFileter();
	return specialCharacterFilter();
}
/**
 * Splits the global codeSource into tokens and records each one in the global
 * tokens map together with its type code.
 *
 * Spaces separate tokens and are skipped. For every other position the type of
 * the starting character decides which scanner consumes the token:
 *   1 digit, 2 letter, 3 underscore -> dedicated scanner, returns the next index
 *   4 bound symbol, 5 operator      -> single-character token
 *   6 anything else                 -> prints "wrong grammer" and terminates
 *                                      the process with a failure status
 */
void separateAndJudge()
{
	// The scanners take and return int indices, so the cursor is an int as well.
	const int length = static_cast<int>(codeSource.length());
	int cnt = 0;
	while (cnt < length)
	{
		const char current = codeSource[cnt];
		if (current == ' ')
		{
			++cnt;
			continue;
		}

		switch (judgeStartingCharactorType(current))
		{
		case 1:
			cnt = digitStarted(cnt);
			break;
		case 2:
			cnt = alphaStarted(cnt);
			break;
		case 3:
			cnt = underlineStarted(cnt);
			break;
		case 4:
			tokens[transCharToString(current)] = 4;
			++cnt;
			break;
		case 5:
			tokens[transCharToString(current)] = 5;
			++cnt;
			break;
		case 6:
			cout << "wrong grammer" << endl;
			exit(EXIT_FAILURE);
		default:
			++cnt;
			break;
		}
	}
}
/**
 * Prints every entry of the global tokens map, one per line, as
 * "<token text> <type code>". Entries appear in the map's key order.
 */
void showTokenData()
{
	for (const auto& entry : tokens)
	{
		cout << entry.first << ' ' << entry.second << endl;
	}
}


/**
 * Removes every "//" comment from the global codeSource.
 *
 * A comment runs from "//" up to, but not including, the next '\n'; the line
 * break itself is kept. A comment on the last line that has no trailing
 * newline is removed up to the end of the text.
 *
 * Limitation: "//" is matched textually, so an occurrence inside a string
 * literal or inside a block comment is treated as a comment start as well.
 *
 * @return the text of codeSource after the removal.
 */
string singleLineCommentsFilter()
{
	string::size_type start = codeSource.find("//");
	while (start != string::npos)
	{
		const string::size_type lineEnd = codeSource.find('\n', start);
		// With no newline left, the comment extends to the end of the text.
		const string::size_type count =
			(lineEnd == string::npos) ? string::npos : lineEnd - start;
		codeSource.erase(start, count);
		start = codeSource.find("//", start);
	}
	return codeSource;
}
/**
 * Removes every block comment from the global codeSource.
 *
 * Each comment, delimiters included, is replaced by a single space so that the
 * tokens on either side of it stay separated (C itself treats a comment as one
 * space). If a comment opener has no matching terminator, prints
 * "multilineCommments wrong" and terminates the process with a failure status.
 *
 * Limitation: the opener is matched textually, so an occurrence inside a
 * string literal is treated as a comment start as well.
 *
 * @return the text of codeSource after the removal.
 */
string multilineCommmentsFileter()
{
	string::size_type open = codeSource.find("/*");
	while (open != string::npos)
	{
		// Search after the opener so that the '*' of the opener cannot also
		// serve as the '*' of the terminator.
		const string::size_type close = codeSource.find("*/", open + 2);
		if (close == string::npos)
		{
			cout << "multilineCommments wrong" << endl;
			exit(EXIT_FAILURE);
		}
		codeSource.replace(open, close + 2 - open, " ");
		open = codeSource.find("/*", open + 1);
	}
	return codeSource;
}
/**
 * Neutralises whitespace control characters in the global codeSource.
 *
 * Every '\n', '\t', '\v', '\r' and '\f' is replaced by a space. They are
 * replaced rather than deleted because the tokenizer uses the space as its
 * separator: deleting a line break or tab would glue together two tokens that
 * were separated only by it.
 *
 * @return the text of codeSource after the replacement.
 */
string specialCharacterFilter()
{
	for (char& ch : codeSource)
	{
		if (ch == '\n' or ch == '\t' or ch == '\v' or ch == '\r' or ch == '\f')
		{
			ch = ' ';
		}
	}
	return codeSource;
}


/**
 * Classifies the first character of a token.
 *
 * The checks are made in this order and the first match wins:
 *   1 digit, 2 letter, 3 underscore, 4 bound symbol, 5 operator,
 *   6 none of the above.
 *
 * @param ch  character to classify.
 * @return    the type code, 1 to 6.
 */
int judgeStartingCharactorType(char ch)
{
	if (isDigit(ch))
	{
		return 1;
	}
	if (isAlpha(ch))
	{
		return 2;
	}
	if (ch == '_')
	{
		return 3;
	}
	if (isBoundSymbol(ch))
	{
		return 4;
	}
	if (isOperator(ch))
	{
		return 5;
	}
	return 6;
}

/**
 * Scans a numeric token of the global codeSource that begins at index cnt.
 *
 * The character at cnt is always taken; after it, the token continues for as
 * long as the characters are digits or '.'. The token is stored in the global
 * tokens map with type code 3.
 *
 * @param cnt  index of the token's first character.
 * @return     index of the first character after the token.
 */
int digitStarted(int cnt)
{
	const int first = cnt;
	int next = first + 1;
	while (isDigit(codeSource[next]) or codeSource[next] == '.')
	{
		++next;
	}
	tokens[codeSource.substr(first, next - first)] = 3;
	return next;
}
/**
 * Scans a word of the global codeSource that begins at index cnt.
 *
 * The character at cnt is always taken; after it, the word continues for as
 * long as the characters are letters, digits or '_'. The word is stored in the
 * global tokens map with type code 2 when it is a reserved word and 1
 * (identifier) otherwise.
 *
 * @param cnt  index of the word's first character.
 * @return     index of the first character after the word.
 */
int alphaStarted(int cnt)
{
	const int first = cnt;
	int next = first + 1;
	for (;;)
	{
		const char ch = codeSource[next];
		if (!(isAlpha(ch) or isDigit(ch) or ch == '_'))
		{
			break;
		}
		++next;
	}

	const string word = codeSource.substr(first, next - first);
	tokens[word] = isReservedWord(word) ? 2 : 1;
	return next;
}
/**
 * Scans an identifier of the global codeSource that begins with '_' at index
 * cnt.
 *
 * The character at cnt is always taken; after it, the identifier continues for
 * as long as the characters are letters, digits or '_'. Accepting '_' in the
 * continuation keeps names such as _my_var or __x in one token, the same rule
 * alphaStarted() applies. The identifier is stored in the global tokens map
 * with type code 1.
 *
 * @param cnt  index of the identifier's first character.
 * @return     index of the first character after the identifier.
 */
int underlineStarted(int cnt)
{
	const int first = cnt;
	++cnt;
	while (isAlpha(codeSource[cnt]) or isDigit(codeSource[cnt]) or codeSource[cnt] == '_')
	{
		++cnt;
	}
	tokens[codeSource.substr(first, cnt - first)] = 1;
	return cnt;
}


/**
 * Builds a one-character string.
 *
 * @param ch  the character to wrap.
 * @return    a string of length 1 whose only character is ch.
 */
string transCharToString(char ch)
{
	return string(1, ch);
}


/**
 * Tells whether a word is one of the 32 reserved words listed below.
 *
 * The comparison is exact and case-sensitive.
 *
 * @param vocabulary  the word to look up.
 * @return            true when the word is in the table, false otherwise.
 */
bool isReservedWord(string vocabulary)
{
	// Built once, not on every call; plain C strings need no construction.
	static const char* const reservedWords[] = {
		"auto", "break", "case", "char", "const", "continue",
		"default", "do", "double", "else", "enum", "extern",
		"float", "for", "goto", "if", "int", "long",
		"register", "return", "short", "signed", "sizeof", "static",
		"struct", "switch", "typedef", "union", "unsigned", "void",
		"volatile", "while"
	};

	for (const char* word : reservedWords)
	{
		if (vocabulary == word)
		{
			return true;
		}
	}
	return false;
}

/**
 * Tells whether a character is a bound (delimiter) symbol.
 *
 * @param ch  character to test.
 * @return    true for one of ( ) , ; { } and false for anything else.
 */
bool isBoundSymbol(char ch)
{
	switch (ch)
	{
	case '(':
	case ')':
	case ',':
	case ';':
	case '{':
	case '}':
		return true;
	default:
		return false;
	}
}
/**
 * Tells whether a character is one of the recognised operator characters.
 *
 * The character is compared directly. Converting it with to_string() would
 * produce the decimal value of its character code (for example "43" for '+'),
 * which can never equal an operator symbol.
 *
 * @param ch  character to test.
 * @return    true for one of + - * / = % > < and false for anything else.
 */
bool isOperator(char ch)
{
	switch (ch)
	{
	case '+':
	case '-':
	case '*':
	case '/':
	case '=':
	case '%':
	case '>':
	case '<':
		return true;
	default:
		return false;
	}
}
/**
 * Tells whether a character is a decimal digit.
 *
 * @param ch  character to test.
 * @return    true when ch lies in the range '0' to '9', false otherwise.
 */
bool isDigit(char ch)
{
	return '0' <= ch && ch <= '9';
}
/**
 * Tells whether a character is an ASCII letter.
 *
 * @param ch  character to test.
 * @return    true when ch lies in 'a'..'z' or 'A'..'Z', false otherwise.
 */
bool isAlpha(char ch)
{
	const bool lowerCase = 'a' <= ch && ch <= 'z';
	const bool upperCase = 'A' <= ch && ch <= 'Z';
	return lowerCase || upperCase;
}

C language lexical analyzer (C++ language implementation)

C language lexical analyzer (C++ language implementation)

Guess you like