基于C++的词法分析器

实验目的

通过设计编制调试一个具体的词法分析程序，加深对词法分析原理的理解。并掌握在对程序设计语言源程序进行扫描过程中将其分解为各类单词的词法分析方法。

编制一个读单词过程，从输入的源程序中，识别出各个具有独立意义的单词(token)，即基本保留字、标识符、常量、运算符、分隔符五大类，并依次输出各个单词的内部编码及单词符号自身值。（遇到错误时可显示“Error”，然后跳过错误部分继续显示）

资源下载地址：https://download.csdn.net/download/qq_41406816/10791074

输出结果/输入内容

输出内容

二、变量

基本保留字：if、int、for、while、do、return、break、continue；（还可以加载很多，但是我的代码里面只写了这几个）

运算符：+、-、*、/、=、>、<、>=、<=、!=

分隔符：,、;、{、}、(、)

常量：小数、整数、科学记数法、负号的识别

注释：单行注释（//）、多行注释（/**/）

三、主要识别的功能：

识别上面的变量
{}和（）的匹配
空白、换行符、制表符的剔除

四、主要函数介绍

digitprocess()//识别常量

alphaprocess()//字符，通过search函数判断是保留字还是标识符

search()//判断是保留字还是标识符

otherprocess()//其他的。运算符，注释，分隔符

Judge()//利用出入栈，辨别{}和（）是否配对

（1）注释：

用 token 字符串存放当前单词。当 token[0]='/' 时，再读取一个字符：如果 token[1]='*'，说明是多行注释（/**/），循环读取字符，直到遇到 '/' 为止；如果 token[1]='/'，说明是单行注释（//），循环读取字符，直到遇到换行符为止。

	// Excerpt from otherprocess(): handling of a token that starts with '/'.
	if (token == "/")
		{
			fin.get(x);
			token += x;
			if (token[1] == '=')// "/=" compound operator, reported as category 4
			{
				cout << "(4," << token << ")" << endl;
				fin.get(x);
				id = 4;
				return x;
			}
			if (token[1] == '*' || token[1] == '/')// comment (category 6): a "/*" comment is read up to the next '/', a "//" comment up to the newline
			{ 
				int i = 1;
				if (token[1] == '*')
				{
					while (token[i] != '/')/* block comment: keep reading until a '/' has been appended */
					{
						fin.get(x);
						token += x;
						i++;
					}
				
				}
				else
				{
					// line comment: keep reading until a newline has been appended
					while (token[i] != '\n')
					{
						fin.get(x);
						token += x;
						i++;
					}
				}
				// substr(0,i) prints everything before the terminating character at index i
				cout << "(6," << token.substr(0,i) << ")" << endl;
				id = 6;
				fin.get(x);
				return x;
			}
		}

（2）识别实数（如3.2），无符号数（2.1E6）

// Excerpt from digitprocess(): gathers digits, '.' and 'E' into one numeric constant.
while (isdigit(butter)||butter=='.'||butter=='E')
	{
		if (butter == 'E')
		{
			// exponent part: after 'E', accept digits and '+' / '-' characters
			token += butter;
			fin.get(x);// read the next character
			butter = x;
			while (isdigit(butter) || butter == '+' || butter == '-')
			{
				token += butter;// append to the token string
				fin.get(x);// read the next character
				butter = x;
			}
			// the exponent ends the constant; report it as category 3
			cout << "(3," << token << ")" << endl;
			id = 3;
			return x;
		}
		token += butter;// append to the token string
		fin.get(x);// read the next character
		butter = x;
	}

（3）检查大小括号是否匹配。使用出入栈。思想大概是：只要是左括号，让他入栈。如果读到右括号，则与当前栈顶的括号相匹配，如果是对应的左括号，则左括号出栈，以此类推。如果括号不匹配，则退出。（网上有很多版本的代码，读者可以自己去搜索）下面的代码也有我自己的出入栈。

（4）辨别负号

经过推理发现，只要“-”前面是运算符，那么这个“-”一定是负号。所以我们可以通过记录前面的字符来辨别。我是用id来记录每一次的数据。

五、代码（仅供参考）

运行环境 vs2017+win10

#include<stdio.h>
#include<string.h>
#include<string>
#include <ctype.h>
#include <malloc.h>
#include <stdlib.h>
#include<iostream>
#include<fstream>
#include<stack>
using namespace std;
// Reserved words recognised by search(); search() iterates over exactly 8 entries.
const char *keyword[8] = { "break","if","continue","while","do","int","for","return"};
// x is the character most recently read from fin; cbuffer is not referenced in the visible code.
char x, cbuffer;
ifstream fin("1.txt", ios::in);// input file, opened for reading
int id;// category code of the last token emitted (used to tell what came before)
// big and small are not referenced in the visible code.
int big[2] = { 0,0 };
int small[2] = { 0,0 };
// Bracket stack shared by all calls to Judge().
stack <char>s;
// Result of the most recent Judge() call (assigned in otherprocess, read in main).
bool re;
// Feeds the bracket characters of `str` through the shared stack `s` and
// reports whether everything seen so far is balanced.
//
// Opening brackets ( [ { are pushed. A closing bracket must match the opener
// on top of the stack, which is then popped. Characters that are not brackets
// are ignored. Because the stack is global, the function can be called
// repeatedly with successive pieces of the input.
//
// Returns true only when the stack is empty after `str` has been processed.
// On a mismatched or unexpected closing bracket the function returns false
// immediately and leaves the offending closer on the stack as a marker: no
// closing bracket can ever match it, so the stack never becomes empty again
// and later calls keep returning false instead of "forgetting" the error.
bool Judge(string str) {
	for (string::size_type i = 0; i < str.length(); i++) {
		const char c = str[i];
		char opener = 0;  // the opening bracket that `c` has to pair with
		switch (c) {
		case '(':
		case '[':
		case '{':
			s.push(c);
			continue;
		case ')':
			opener = '(';
			break;
		case ']':
			opener = '[';
			break;
		case '}':
			opener = '{';
			break;
		default:
			continue;  // not a bracket
		}

		if (s.empty() || s.top() != opener) {
			s.push(c);  // permanent failure marker, see header comment
			return false;
		}
		s.pop();
	}
	return s.empty();
}
// Scans a numeric constant that starts with `butter` (already read from fin)
// and prints it as "(3,<text>)".
//
// Accepted shape: a run of digits and '.' characters, optionally followed by
// an exponent: 'E', at most one '+' or '-' sign, then digits. The exponent
// always ends the constant.
//
// Side effects: advances fin, updates the global x, sets id to 3.
// Returns x, the first character after the constant. If the input ended while
// scanning, fin is left in a failed state and x still holds the last character
// that was read, so the caller must check the stream before using the result.
char digitprocess(char butter)
{
	string token;
	// The casts keep isdigit() defined for characters with a negative value.
	while (isdigit(static_cast<unsigned char>(butter)) || butter == '.' || butter == 'E')
	{
		const bool exponentMarker = (butter == 'E');
		token += butter;
		if (fin.get(x).fail())
			break;  // end of input: a failed get() leaves x unchanged, so stop here
		butter = x;
		if (!exponentMarker)
			continue;

		// Only a sign directly after 'E' belongs to the constant; a later
		// '+' or '-' is an operator (e.g. "1E5-3").
		if (butter == '+' || butter == '-')
		{
			token += butter;
			if (fin.get(x).fail())
				break;
			butter = x;
		}
		bool inputLeft = true;
		while (inputLeft && isdigit(static_cast<unsigned char>(butter)))
		{
			token += butter;
			inputLeft = !fin.get(x).fail();
			butter = x;
		}
		break;  // the exponent terminates the constant
	}
	cout << "(3," << token << ")" << endl;
	id = 3;
	return x;
}
// Tells whether `list` is one of the reserved words in the keyword table.
// Returns true for a reserved word and false otherwise.
bool search(string list)
{
	const char *candidate = list.c_str();
	int idx = 0;
	while (idx < 8)  // the keyword table holds 8 entries
	{
		if (strcmp(keyword[idx], candidate) == 0)
			return true;  // exact match: reserved word
		++idx;
	}
	return false;
}
// Scans a word that starts with `butter` (already read from fin) and prints it
// as a reserved word "(1,<text>)" or as an identifier "(2,<text>)", depending
// on search().
//
// The word continues over letters, digits and underscores, so names such as
// "a1" or "my_var" stay in one piece.
//
// Side effects: advances fin, updates the global x, sets id to 1 or 2.
// Returns x, the first character after the word; that character has NOT been
// processed yet, for reserved words as well as identifiers. If the input ended
// while scanning, fin is left in a failed state and x still holds the last
// character that was read, so the caller must check the stream.
char alphaprocess(char butter)
{
	string token;
	// The cast keeps isalnum() defined for characters with a negative value.
	while (isalnum(static_cast<unsigned char>(butter)) || butter == '_')
	{
		token += butter;
		if (fin.get(x).fail())
			break;  // end of input: a failed get() leaves x unchanged, so stop here
		butter = x;
	}

	if (search(token))
	{
		cout << "(1," << token << ")" << endl;
		id = 1;
	}
	else
	{
		cout << "(2," << token << ")" << endl;
		id = 2;
	}
	return x;
}
char otherprocess(char butter)
{
	string token = "";
	token += butter;

	//,;{}()是分隔符 5
	if (token == "," || token == ";")
	{
			
		cout << "(5," << butter << ")" << endl;
		fin.get(x);
		id = 5;
		return x;
	}
	if (token == "{" || token == "}" || token == "(" || token == ")")
	{
		re=Judge(token);
		cout << "(5," << butter << ")" << endl;
		fin.get(x);
		id = 5;
		return x;
	}
	// * or / or *= or /= 是运算符4 /**/ or // 注释6
	if (token == "*" || token == "/")
	{
		if (token == "*")
		{
			fin.get(x);
			token += x;
			if (token[1] == '=')// *=
			{
				cout << "(4," << token << ")" << endl;
				fin.get(x);
				id = 4;
				return x;
			}
		}
		if (token == "/")
		{
			fin.get(x);
			token += x;
			if (token[1] == '=')// /=
			{
				cout << "(4," << token << ")" << endl;
				fin.get(x);
				id = 4;
				return x;
			}
			if (token[1] == '*' || token[1] == '/')// 注释 6 第一种 是遇到/就停止，第二种是换行
			{ 
				int i = 1;
				if (token[1] == '*')
				{
					while (token[i] != '/')/* */
					{
						fin.get(x);
						token += x;
						i++;
					}
				
				}
				else
				{
					while (token[i] != '\n')
					{
						fin.get(x);
						token += x;
						i++;
					}
				}
				cout << "(6," << token.substr(0,i) << ")" << endl;
				id = 6;
				fin.get(x);
				return x;
			}
		}
		//排除了注释和 /= *= 那么 就只是单纯的 * /了
		cout << "(4," << token << ")" << endl;
		id = 4;
		fin.get(x);
		return x;
	}
	if (token == "=" || token == "!" || token == "<" || token == ">")
	{
		fin.get(x);
		
		if (token[1] == '=')
		{	token += x;
			cout << "(4," << token << ")" << endl;
			id = 4;
			fin.get(x);
		}
		cout << "(4," << token << ")" << endl;
		id = 4;
		return x;
	}
	if (token == "+" || token == "-")
	{
		if (id == 4)//在运算符后面的-，一定是负号
		{
			int i = 1;
			fin.get(x);
			token += x;
			if (isdigit(token[1]))
			{
				while ((isdigit(token[i]) || token[i] == 'E' || token[i] == '.'))
				{
					fin.get(x);
					token += x;
					
					i++;
				}
				cout << "(3," << token.substr(0,i) << ")" << endl;
				id = 3;
				return x;
			}
			

		}
		if (token == "+")// ++ += 
		{
			fin.get(x);
			token += x;
			if (token[1] == '=' || token[1] == '+')
			{
				cout << "(4," << token << ")" << endl;
				id = 4;
				fin.get(x);
				return x;
			}
			cout << "(4," << token[0] << ")" << endl;
			id = 4;
			//fin.get(x);
			return x;
		}
		if (token == "-")//-- -=
		{
			fin.get(x);
			token += x;
			if (token[1] == '=' || token[1] == '-')
			{
				cout << "(4," << token << ")" << endl;
				id = 4;
				fin.get(x);
				return x;
			}
			cout << "(4," << token << ")" << endl;
			id = 4;
			fin.get(x);
			return x;
		}
	}
}
// Entry point: tokenises the file behind the global stream `fin` and then
// reports whether the brackets in it were balanced.
//
// Token categories printed by the helpers:
//   1 reserved word, 2 identifier, 3 constant, 4 operator, 5 separator, 6 comment
//
// Returns 1 when the input file could not be opened, 0 otherwise.
int main()
{
	if (!fin)
	{
		cout << "error";
		return 1;  // nothing to scan
	}

	// A file without any bracket is balanced; otherprocess() overwrites this
	// with the result of Judge() each time it meets a bracket.
	re = true;

	// Invariant: while fin is good, x holds a character that has been read but
	// not processed yet. Testing the stream itself, rather than peeking at the
	// following character, ensures the last character of the file is handled.
	fin.get(x);
	while (fin)
	{
		if (isspace(static_cast<unsigned char>(x)))
		{
			fin.get(x);
			continue;  // re-check: several whitespace characters may follow each other
		}
		if (isdigit(static_cast<unsigned char>(x)))  // constant
		{
			x = digitprocess(x);
		}
		else if (isalpha(static_cast<unsigned char>(x)))  // reserved word or identifier
		{
			x = alphaprocess(x);
		}
		else  // operator, separator or comment
		{
			x = otherprocess(x);
		}
	}

	if (re)
	{
		cout << "{}与（）匹配成功" << endl;
	}
	else
	{
		cout << "{}与（）匹配失败" << endl;
	}
	fin.close();
	return 0;
}

运行结果

欢迎大家来交流意见~~