【笔记】C++ 多线程读入数据

差的好多，抓紧学了。

参考链接：

https://blog.csdn.net/nirendao/article/details/88096195
std::istream::seekg
std::chrono::steady_clock

1. 计算文件大小

std::ifstream的两个成员函数

pos_type tellg()：返回输入流的当前位置（类型为std::streampos，可以转换成整数），也就是下一个要读的字符距离首字符的偏移量；失败时返回-1。
istream& seekg(off, way)，设置输入流的当前位置。
off接受一个整数，表示要改动的偏移量。
way表示从哪里开始计算，可以选择is.beg, is.end, is.cur三种之一，分别表示流的开头、结尾和当前位置，其中is表示这个输入流。只传一个参数时（seekg(pos)），pos是从流开头算起的绝对位置。新位置等于基准位置加上off，所以选择is.end时，off要传负数才会朝文件开头方向移动。

如果需要计算文件大小，只需要先seekg(0, is.end)，再tellg就可以获得。

#include <iostream>    
#include <fstream> 

// Prints the stream position at the end of the sample data file (i.e. its
// size), followed by the processor time consumed as reported by clock().
int main(void)
{
	const clock_t started = clock();

	std::ifstream input("../data/data100mb_rand.txt");
	if(!input)
	{
		printf("file error\n");
	}
	else
	{
		// Move the read position to the end; the position there is the file length.
		input.seekg(0, std::ios::end);
		const long long length = input.tellg();
		printf("%lld\n", length);
		input.close();
	}

	const double elapsed = double(clock() - started) / CLOCKS_PER_SEC;
	printf("time_cost = %.2fs\n", elapsed);

	return 0;
}

2. 设置读入点、读入量

在我的需求中，整个文件由若干行组成，需要每行独立处理，且每行不会很长（50字节左右）。

为了在多线程并发读入中，让所有行不重不漏，需要设置每个线程的起始读入点以及读入量。

方法：

首先按文件大小平均分割，找到初始读入点
除第一个读入点外，所有读入点处执行一次getline操作，找到下一行开头。
通过计算相邻读入点位置的差，即可得到每个线程的读入量。

// Number of worker threads; also the default number of ranges the file is split into.
const int threads = 8;
// Default input file used by the snippets in this note.
const std::string file_path = "../data/data100mb_rand.txt";

// Splits a line-oriented file into `parts` contiguous byte ranges, each of
// which starts at the beginning of a line.
//
// path  : file to split (defaults to the global file_path).
// parts : number of ranges to produce (defaults to the global threads).
//
// Returns one {start offset, byte count} pair per range. The ranges are
// contiguous, start at offset 0 and together cover the whole file; a range
// may be empty (byte count 0) when the file has fewer lines than `parts`.
// Returns an empty vector when parts < 1. Prints "file error" and exits the
// process when the file cannot be opened or its size cannot be determined.
std::vector<std::pair<long long, long long>> get_read_point(const std::string &path = file_path, int parts = threads)
{
	if(parts < 1)
		return {};

	// Binary mode keeps seekg()/tellg() offsets equal to raw byte offsets
	// (no newline translation), so the computed ranges are exact.
	std::ifstream is(path, std::ios::in | std::ios::binary);
	if(!is)
	{
		printf("file error\n");
		exit(1);
	}
	is.seekg(0, std::ios::end);
	const long long length = is.tellg();
	if(length < 0)
	{
		printf("file error\n");
		exit(1);
	}

	std::vector<std::pair<long long, long long>> res(parts);
	const long long chunk = length / parts;

	res[0].first = 0;
	for(int i=1; i<parts; ++i)
	{
		// Default to end-of-file: used when no further line start exists.
		long long pos = length;

		// A previous getline() may have hit EOF; seekg() does not clear
		// failbit, so reset the state before repositioning.
		is.clear();
		if(is.seekg(chunk * i))
		{
			// Skip the (possibly partial) line at the tentative split point
			// so that the range starts on a line boundary.
			std::string tmp;
			if(std::getline(is, tmp) && !is.eof())
			{
				// tellg() is only reliable while eofbit is clear; with
				// eofbit set it reports -1 instead of the file size.
				const long long next_line = is.tellg();
				if(next_line >= 0)
					pos = next_line;
			}
		}
		res[i].first = pos;
		res[i-1].second = res[i].first - res[i-1].first;
	}
	res.back().second = length - res.back().first;

	is.close();
	/*
	for(auto x:res)
		printf("%lld, %lld %lld\n",x.first,x.second,x.first+x.second);
	*/
	return res;
}

3. 计时

使用多线程前需要注意一点：clock()统计的是整个进程占用的CPU时间（在Linux等平台上相当于所有线程的CPU时间之和），因此多线程时算出的时间会远高于实际经过的时间。如果需要计时，可以使用<chrono>头文件中的steady_clock类。

<chrono>中的三个clock类：

steady_clock，专用于计算时间间隔
system_clock，系统范围的实时clock，可以用来进行时间或日期的转换
high_resolution_clock，高精度计时（在常见实现中通常只是steady_clock或system_clock的别名）

#include <chrono>
using namespace std::chrono;

// Sample the monotonic clock before and after the measured work.
const auto t1 = steady_clock::now();
// do something
const auto t2 = steady_clock::now();

// A floating-point duration in seconds converts implicitly from the clock's tick type.
const duration<double> elapsed = t2 - t1;
const double t = elapsed.count();
printf("time_cost = %.2fs\n",t );

还有一种方式是，假设编译后的可执行文件名字叫做test，那么在terminal当前目录下执行time ./test即可计时运行。

4. 多线程读入

每个线程需要做的事情：

创建文件对象，找到读入点
开始读入，直到读入量
统计总读入量
为了方便调试，中间输出提示信息

下面的代码中函数规定每个线程所做的事情，参数分别是线程id、起始位置、读入量；利用全局的cnt数组来保存每个线程的读入量。

// Per-thread line counters, indexed by thread id. Each thread adds only to
// its own slot, so no locking is needed; ids must be in [0, 200).
long long cnt[200];

// Worker body: reads whole lines from file_path starting at byte offset
// start_point until `volume` bytes have been consumed, and adds the number
// of lines read to cnt[id].
//
// id          : index of this worker's slot in cnt.
// start_point : byte offset of the first line to read.
// volume      : number of bytes this worker is responsible for.
//
// Prints "file error" and exits the process if the file cannot be opened or
// the seek fails.
void thread_read_file(int id, long long start_point, long long volume)
{
	// Binary mode: every line then costs exactly url.size() + 1 bytes, which
	// keeps the accounting below in step with the byte offsets passed in.
	// NOTE: with CRLF line endings the trailing '\r' stays in url.
	std::ifstream is(file_path, std::ios::in | std::ios::binary);
	is.seekg(start_point);
	if(!is)
	{
		printf("file error\n");
		exit(1);
	}

	// Count locally and publish once at the end: writing a shared array on
	// every line makes neighbouring threads contend for the same cache line.
	long long lines_read = 0;

	std::string url;
	while(volume>0 && std::getline(is, url))
	{
		//do_some_thing(url);
		// +1 for the newline that getline consumed; cast keeps the arithmetic signed.
		volume -= static_cast<long long>(url.size()) + 1;

		// Sparse progress trace: fires only when the low 20 bits of the
		// remaining volume are all ones.
		if((volume & 1048575) == 1048575) printf("%d %lld\n",id,volume );
		++lines_read;
	}
	cnt[id] += lines_read;
}

注意，如果使用一个变量而非数组来保存总读入量（同时被每个线程++），那么最后的cnt值会少很多：多个线程对同一个变量做++不是原子操作，会产生数据竞争，一部分自增会丢失。

想要解决这个问题，可以使用锁机制（如std::mutex）或原子变量（std::atomic），不在这里展开。

设置线程

// Start one worker per read range, then block until every worker has finished.
vector<std::thread> vec_thread;
for(int i=0; i<threads; ++i)
	vec_thread.emplace_back(thread_read_file, i, read_point[i].first, read_point[i].second);
for(std::thread &worker : vec_thread)
	worker.join();

5. 代码汇总

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <numeric>
#include <string>
#include <thread>
#include <utility>
#include <vector>

using namespace std;

// Worker-thread count; the input file is split into this many ranges.
constexpr int threads{8};
// Path of the input file that is split and read by the workers.
const std::string file_path{"../data/data1gb_rand.txt"};

// Per-thread line counters, indexed by thread id. Each thread adds only to
// its own slot, so no locking is needed; ids must be in [0, 200).
long long cnt[200];

// Worker body: reads whole lines from file_path starting at byte offset
// start_point until `volume` bytes have been consumed, and adds the number
// of lines read to cnt[id].
//
// id          : index of this worker's slot in cnt.
// start_point : byte offset of the first line to read.
// volume      : number of bytes this worker is responsible for.
//
// Prints "file error" and exits the process if the file cannot be opened or
// the seek fails.
void thread_read_file(int id, long long start_point, long long volume)
{
	// Binary mode: every line then costs exactly url.size() + 1 bytes, which
	// keeps the accounting below in step with the byte offsets passed in.
	// NOTE: with CRLF line endings the trailing '\r' stays in url.
	std::ifstream is(file_path, std::ios::in | std::ios::binary);
	is.seekg(start_point);
	if(!is)
	{
		printf("file error\n");
		exit(1);
	}

	// Count locally and publish once at the end: writing a shared array on
	// every line makes neighbouring threads contend for the same cache line.
	long long lines_read = 0;

	std::string url;
	while(volume>0 && std::getline(is, url))
	{
		//do_some_thing(url);
		// +1 for the newline that getline consumed; cast keeps the arithmetic signed.
		volume -= static_cast<long long>(url.size()) + 1;

		// Sparse progress trace: fires only when the low 20 bits of the
		// remaining volume are all ones.
		if((volume & 1048575) == 1048575) printf("%d %lld\n",id,volume );
		++lines_read;
	}
	cnt[id] += lines_read;
}

// Splits a line-oriented file into `parts` contiguous byte ranges, each of
// which starts at the beginning of a line, and prints every range as
// "start, size end".
//
// path  : file to split (defaults to the global file_path).
// parts : number of ranges to produce (defaults to the global threads).
//
// Returns one {start offset, byte count} pair per range. The ranges are
// contiguous, start at offset 0 and together cover the whole file; a range
// may be empty (byte count 0) when the file has fewer lines than `parts`.
// Returns an empty vector when parts < 1. Prints "file error" and exits the
// process when the file cannot be opened or its size cannot be determined.
std::vector<std::pair<long long, long long>> get_read_point(const std::string &path = file_path, int parts = threads)
{
	if(parts < 1)
		return {};

	// Binary mode keeps seekg()/tellg() offsets equal to raw byte offsets
	// (no newline translation), so the computed ranges are exact.
	std::ifstream is(path, std::ios::in | std::ios::binary);
	if(!is)
	{
		printf("file error\n");
		exit(1);
	}
	is.seekg(0, std::ios::end);
	const long long length = is.tellg();
	if(length < 0)
	{
		printf("file error\n");
		exit(1);
	}

	std::vector<std::pair<long long, long long>> res(parts);
	const long long chunk = length / parts;

	res[0].first = 0;
	for(int i=1; i<parts; ++i)
	{
		// Default to end-of-file: used when no further line start exists.
		long long pos = length;

		// A previous getline() may have hit EOF; seekg() does not clear
		// failbit, so reset the state before repositioning.
		is.clear();
		if(is.seekg(chunk * i))
		{
			// Skip the (possibly partial) line at the tentative split point
			// so that the range starts on a line boundary.
			std::string tmp;
			if(std::getline(is, tmp) && !is.eof())
			{
				// tellg() is only reliable while eofbit is clear; with
				// eofbit set it reports -1 instead of the file size.
				const long long next_line = is.tellg();
				if(next_line >= 0)
					pos = next_line;
			}
		}
		res[i].first = pos;
		res[i-1].second = res[i].first - res[i-1].first;
	}
	res.back().second = length - res.back().first;

	is.close();

	// Debug trace: start offset, byte count and end offset of every range.
	for(const auto &x : res)
		printf("%lld, %lld %lld\n",x.first,x.second,x.first+x.second);

	return res;
}


int main(void)
{
	using namespace std::chrono;
	steady_clock::time_point t1 = steady_clock::now();

	vector<pair<long long, long long>> read_point = get_read_point();

	vector<std::thread> vec_thread;
	for(int i=0; i<threads; ++i)
	{
		std::thread th(thread_read_file, i, read_point[i].first, read_point[i].second);
		vec_thread.push_back(std::move(th));
	}
	for(auto &th:vec_thread)
		th.join();

	printf("%lld\n",accumulate(cnt,cnt+threads,0ll) );

	double t = duration_cast<duration<double>>(steady_clock::now() - t1).count();

	printf("time_cost = %.2fs\n",t );

	return 0;
}

Little_Fall

发布了375 篇原创文章 · 获赞 305 · 访问量 7万+

私信关注