在做C++教材练习题时,其中一题要求统计文本文档中的汉字个数,当我直接按char ch做的时候,发现统计出的字数和文本文档的字节数相同。why?
也就是说,常规方法下统计的实际不是文本文档中的字符个数,而是这些字符的字节和。
下面是我原先的代码:
#include <iostream>
#include <strstream>
#include <fstream>
#include <io.h>
#include <string.h>
using namespace std;
int FrequencyCounts(char *);
int main()
{
long handle;
struct _finddata_t info;
handle = _findfirst("*.txt", &info);
if (handle == -1)
return 0;
int count[3];
int i = 0;
do
{
count[i] = FrequencyCounts(info.name);
++i;
} while (_findnext(handle, &info) == 0);
char output_name[100];
strcpy(output_name, "多个文件汉字字频统计结果.txt");
ofstream out;
out.open(output_name, ios::app);
if (!out)
{
cout << "Can't open the file!" << output_name << endl;
exit(0);
}
for (i = 0; i < 3; ++i)
{
out << "test" << i+1 << ".txt" << "中共有" << count[i] << "个字" << endl;
}
out.close();
cout << "统计完毕" << endl;
return 0;
}
int FrequencyCounts(char *a)
{
ifstream in(a);
if (!in)
{
cout << "Can't open the file!" << a << endl;
exit(0);
}
char ch;
int count = 0;
while (in.get(ch))
{
count++;
}
in.close();
return count;
}
最后如何改正呢?参考https://blog.csdn.net/bufanq/article/details/51034156的文章,
改为:
while (in.get(ch))
{
if((ch & 0x80) == 0x80)
count++;
}
仅此记录。