C++学习之第六天-统计圣经文本的词频

二、编程题--统计圣经出现的单词以及词频。

1、统计一篇英文文章（The_Holy_Bible.txt）中出现的单词和词频。输入：某篇文章的绝对路径；输出：词典（词典中的每一行都是一个“单词 词频”）。

词典的存储格式如下：


|   a 66          |
|   abandon 77    |
|   public 88     |
|    ......	      |
|_________________|

// One dictionary entry: a word together with its frequency.
struct Record
{
	std::string _word;  // the word text
	int _frequency;     // frequency recorded for _word
};

class Dictionary
{
public:
	//......
    void read(const std::string &filename);
    void store(const std::string &filename);
private:
	vector<Record> _dict;
};

提示：因为我们需要统计圣经文件中的单词以及每个单词在文件中出现的次数，所以可以先去读圣经文件，然后将单词存到数据结构中，并记录单词的次数；如果单词再次出现，只需要修改该单词的次数（也就是这里说的单词的频率）。这样当统计完整个圣经文件后，数据就都存在vector这个数据结构中了。接着遍历vector就可以将单词以及单词次数(也就是频率)存储到另外一个文件。(当然如果不存到另外一个文件，就只能打印到终端了)

注意：在读圣经文件的时候，有可能字符串是不合法的，比如：abc123、abc？这样的字符串。处理方式有两种：直接不统计这样的字符串，或者将非法字符去掉即可。

最终得到结果类似：

a 10

public 20

welcome 30.......

main.cpp

主函数部分：

1.读取圣经数据，统计单词词频，存入vector

2.对vector进行排序

3.遍历或者把结果直接存入文件

#include "Bible.h"

void test01()
{
	Dictionary s1;
	s1.read("bible.txt"); //1.读取圣经数据，统计单词词频，存入vector
	s1.dict_Sort();		//2.对vector中的单词进行排序
	//s1.foreach_vector();//3.遍历词典vector，把结果输出到控制台
	s1.store("my_dict.txt");//4.把结果存入文件
}
// Program entry point: runs test01() and then exits successfully.
int main()
{
	test01();
	system("pause"); // runs the shell command "pause" (presumably to keep a Windows console open)
	return EXIT_SUCCESS;
}

Bible.h

1.头文件部分

1.定义一个Record结构体，存储单词以及其词频

2.定义一个类，类里有个字典vector，用来存储Record

3.类里主要需要完成的函数：

a.read(file):读取数据，进行数据处理，把结果存入dict vector中

1.ifstream 从文件中读取数据

2.ifstream的用法，设置循环读取，每次读取一个单词

3.对读出来的单词进行处理：大写转小写、标点符号转空格

4.继续处理步骤3转换过的单词，去掉单词的首尾空格

5.单词去掉首尾空格后，如果中间还有空格，该单词不合法

6.判断单词是否合法，如果合法，把单词存入结构体

7.判断单词是否在vector中出现，出现，对应词频加1，没出现，push_back.

b.dict_Sort()；用algorithm中的sort对vector进行排序，自己写排序规则

c.遍历vector,把结果输出到控制台

d.把vector中的数据对齐，并且写入到文件，ofstream

Bible.h

#pragma once
#define _CRT_SECURE_NO_WARNINGS
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;
// A single dictionary entry.
struct Record
{
	std::string _word; // the word itself
	int _frequency;    // frequency counter kept for _word
};

// Word-frequency dictionary backed by a vector of Record entries.
class Dictionary
{
public:
	// 1. Reads the text, counts its words and stores the result in _dict.
	void read(const std::string &filename);

	// Sorts the entries.
	void dict_Sort();

	// Walks over the vector.
	void foreach_vector();

	// 2. Writes the result held in the vector to a new file.
	void store(const std::string &filename);

private:
	// Converts an int to a string by way of an ostringstream.
	std::string int2String(int number);

	// Strips the leading and trailing spaces from s.
	void trim(std::string &s);

	// Looks for word in vec: returns its index when present, -1 otherwise.
	int compare(std::vector<Record> &vec, std::string &word);

	// Tells whether word is a correct word.
	bool judgefullword(std::string &word);

	std::vector<Record> _dict; // the collected entries
};

bible.cpp


#include "Bible.h"


// Reads the text file `filename` and adds its word counts to _dict.
//
// The file is consumed one whitespace-separated token at a time. In each
// token, 'A'-'Z' is lower-cased and every character that is not a letter is
// replaced by a space; trim() then strips the spaces at both ends. A token
// that ends up empty, or that judgefullword() rejects because a non-letter
// remained in its interior (e.g. "abc123x"), is skipped. Every accepted word
// either bumps the count of its existing Record or is appended with count 1,
// so new words are stored in the order they are first seen.
//
// If the stream cannot be opened, a message is printed to cerr and _dict is
// left untouched.
void Dictionary::read(const string &filename)
{
	ifstream ifs(filename);
	if (!ifs.good())
	{
		cerr << "ifstream is not good" << endl;
		return;
	}

	// Word -> position in _dict. One hash lookup per token replaces a linear
	// scan of the whole vector, which is quadratic on a book-sized input.
	// The index is seeded from _dict so that a repeated read() keeps counting
	// into the entries that are already there.
	unordered_map<string, size_t> position;
	position.reserve(_dict.size());
	for (size_t i = 0; i < _dict.size(); ++i)
	{
		position.emplace(_dict[i]._word, i); // emplace keeps the first index of a word
	}

	string word;
	while (ifs >> word) // operator>> splits on whitespace
	{
		// Normalise the token in place.
		for (size_t i = 0; i < word.size(); ++i)
		{
			char &ch = word[i];
			if (ch >= 'A' && ch <= 'Z')
			{
				ch = static_cast<char>(ch - 'A' + 'a');
			}
			else if (ch < 'a' || ch > 'z')
			{
				ch = ' '; // punctuation, digits and any other byte
			}
		}

		trim(word); // an all-space token becomes empty here
		if (word.empty() || !judgefullword(word))
		{
			continue;
		}

		const auto found = position.find(word);
		if (found != position.end())
		{
			++_dict[found->second]._frequency;
		}
		else
		{
			position.emplace(word, _dict.size());
			_dict.push_back({ word, 1 });
		}
	}
}
// Strips the leading and trailing space characters from s in place.
// A string made only of spaces ends up empty.
void Dictionary::trim(string &s)
{
	if (s.empty())
	{
		return;
	}

	// Drop everything before the first non-space character
	// (npos here erases the whole string).
	const string::size_type first_kept = s.find_first_not_of(' ');
	s.erase(0, first_kept);

	// Drop everything after the last non-space character
	// (npos + 1 wraps to 0, which erases from the start).
	const string::size_type last_kept = s.find_last_not_of(' ');
	s.erase(last_kept + 1);
}
//判断_dict中是否有该单词，如果有，返回该单词的下标，没有返回-1
int Dictionary::compare(vector <Record> &vec, string &word)
{
	for (int i = 0; i < vec.size(); i++)
	{
		if (vec[i]._word == word)
		{
			return i;
		}

	}

	return -1;
}
// Checks whether word consists solely of the letters 'a'..'z'.
// Returns false at the first other character; an empty word yields true.
bool Dictionary::judgefullword(string &word)
{
	for (string::const_iterator it = word.begin(); it != word.end(); ++it)
	{
		const bool is_lowercase_letter = (*it >= 'a' && *it <= 'z');
		if (!is_lowercase_letter)
		{
			return false;
		}
	}
	return true;
}

// Prints every entry of _dict to cout, one per line: a 1-based sequence
// number, the word left-aligned in a 32-character column, a tab, then the
// frequency.
void Dictionary::foreach_vector()
{
	int sequence = 0;
	for (const auto &entry : _dict)
	{
		++sequence;
		cout << int2String(sequence) << "  ";
		cout << setiosflags(ios::left) << setw(32) << entry._word << "\t";
		cout << entry._frequency << endl;
	}
}
// Returns the text form of number, produced by streaming it into an
// ostringstream.
string Dictionary::int2String(int number)
{
	ostringstream formatter;
	formatter << number; // let the stream do the formatting
	return formatter.str();
}

//用于dict_Sort,按照单词的大小来排序
bool string_compare(struct Record &p1, struct Record &p2) 
{
	return p1._word < p2._word;
}

// Sorts _dict in place, using string_compare as the ordering predicate.
void Dictionary::dict_Sort()
{
	std::sort(_dict.begin(), _dict.end(), &string_compare);
}

// Writes the dictionary to the file `filename`, one entry per line:
// a 1-based sequence number, two spaces, the word left-aligned in a
// 32-character column, a tab, then the frequency.
//
// The entries are streamed straight to the file instead of being collected
// in an intermediate string first, and no blank line is appended after the
// last entry, so every line of the output is a dictionary entry.
// If the file cannot be opened, or the write fails, a message is printed to
// cerr.
void Dictionary::store(const string &filename)
{
	ofstream ofs(filename);
	if (!ofs.good())
	{
		cerr << "ofstream is not good " << endl;
		return;
	}

	// The left-alignment flag is sticky; setw() must be repeated per word.
	ofs << setiosflags(ios::left);

	size_t sequence = 0;
	for (const auto &entry : _dict)
	{
		++sequence;
		ofs << sequence << "  "
			<< setw(32) << entry._word << "\t"
			<< entry._frequency << '\n';
	}

	// close() flushes the buffer and sets failbit if that fails.
	ofs.close();
	if (ofs.fail())
	{
		cerr << "failed to write " << filename << endl;
	}
}

C++学习之第六天-统计圣经文本的词频

猜你喜欢