磁盘排序(外排序)

当参加排序的数据量太大、内存不足以一次性存放时,需要使用外排序。外排序既可以采用插入排序的思想,也可以采用归并排序的思想。
下面是自己实现的基于归并排序思想的外排序:先把输入按内存上限切分成若干段,分别排序后写入临时文件,再用小顶堆做多路归并。虽然基本做到了正确排序,且对内存的占用可以控制,但时间效率略低(归并时每取一个数都要重新打开临时文件并从头逐行读到当前位置)。

代码示例

GenerateRandomNumber.h文件,生成随机数序列。

#pragma once
#include <random>
#include  <time.h>
#include <fstream>
#include <iostream>
using namespace std;

// Generates `num` pseudo-random numbers with rand(), seeded from the current time.
//
// Every number is echoed to stdout; when `fileName` is non-empty the numbers are
// also written to that file, one per line.
//
// @param num       How many numbers to produce. Zero or a negative value produces nothing.
// @param fileName  Output file path; an empty string disables file output.
// @return true on success; false if the output file could not be created or written
//         (the reason is printed to stdout).
bool	GenerateRandom(int num = 100, std::string fileName = "")
{
	const bool toFile = !fileName.empty();
	std::ofstream outfile;
	try
	{
		if (toFile)
		{
			outfile.open(fileName);
			if (!outfile)
				throw "随机数输出文件创建失败!";
		}

		srand(static_cast<unsigned int>(time(nullptr)));

		// A counted loop (instead of `while (num--)`) terminates immediately for
		// negative counts rather than running until the counter wraps around.
		for (int i = 0; i < num; ++i)
		{
			const uint32_t n = static_cast<uint32_t>(rand());
			// '\n' instead of endl: no need to flush after every single number.
			if (toFile)
				outfile << n << '\n';
			std::cout << n << '\n';
		}

		if (toFile)
		{
			outfile.flush();
			if (!outfile)
				throw "随机数输出文件写入失败!";
			outfile.close();
		}
	}
	catch (const char* msg)
	{
		std::cout << msg << std::endl;
		return false;
	}
	return true;
}

DiskSort.h文件,磁盘排序的主体实现。

#pragma once
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
#include <queue>
#include <cstdio>
using namespace std;

#define MemoryLimit 1000 // Memory budget in bytes for the in-memory sort buffer; DiskSort divides it by sizeof(int32_t) to get the number of values per sorted run.

// Bookkeeping record for one sorted temporary file taking part in the merge.
// Field order matters: DiskSort builds instances with aggregate initialization.
struct File
{
	std::string filepath; // location of the temporary file on disk
	int currentValue;     // value most recently read from the file; the heap key
	int fileIndex;        // sequence number assigned when the file was created
	int totalCnt;         // number of values stored in the file
	int currentPos;       // how many lines have been consumed so far

	// Deliberately reversed ordering: std::priority_queue is a max-heap, so
	// treating the larger value as "less" puts the smallest value on top.
	bool operator<(const File& f) const
	{
		return f.currentValue < currentValue;
	}
};

// Writes every value of `num` to the file at `path`, one value per line,
// replacing any previous content of that file.
//
// @param num   Values to write, in the order given. Taken by const reference so
//              the buffer is not copied on every call.
// @param path  Destination file path. Taken by const reference so temporaries
//              and const strings are accepted as well.
//
// If the file cannot be opened nothing is written; no error is reported here.
void PrintFile(const std::vector<int>& num, const std::string& path)
{
	std::ofstream out(path);
	if (!out)
		return;
	// '\n' instead of endl avoids flushing the stream after every value;
	// the stream is flushed once when it is closed on scope exit.
	for (const int n : num)
		out << n << '\n';
}

void seek_to_line(File file, char* num_str)
{
	ifstream in;
	in.open(file.filepath);
	int line = file.currentPos;
	++line;
	while (line--)
	{
		in.getline(num_str, 100);
	}
	in.close();
	return;
}

string DiskSort(string infile_path)
{
	int total=0, cnt = 0;
	auto found = infile_path.find_last_of('\\');
	string outfile_path = infile_path.substr(0, found + 1) + "sorted.txt";
	uint32_t maxCnt = MemoryLimit / sizeof(int32_t);
	vector<int32_t> memN(maxCnt);//存放读入内存的数据
	vector<File> tmpfiles;

	ifstream infile; ofstream outfile;
	try
	{
		infile.open(infile_path);
		if (!infile)
			throw "输入文件打开失败!";
		char num_str[100];
		int num_cnt = 0, file_cnt = 0;
		string tmpfile_path = infile_path.substr(0, found + 1) + "tmpout";
		while (infile.getline(num_str, 100))
		{
			memN[num_cnt++] = stoi(num_str);
			if (num_cnt == maxCnt)//达到内存限制
			{
				total += num_cnt;
				sort(memN.begin(), memN.begin() + num_cnt);
				string tmp = tmpfile_path + to_string(file_cnt) + ".txt";
				PrintFile(memN, tmp);
				tmpfiles.push_back({ tmp,0,file_cnt++,num_cnt,0 });
				num_cnt = 0;
			}
		}
		infile.close();
		if (num_cnt > 0)
		{
			total += num_cnt;
			sort(memN.begin(), memN.begin() + num_cnt);
			string tmp = tmpfile_path + to_string(file_cnt);
			PrintFile(memN, tmp);
			tmpfiles.push_back({ tmp,0,file_cnt++,num_cnt,0 });
		}

		priority_queue<File> pq;
		ofstream outfile;
		outfile.open(outfile_path);
		for (auto f : tmpfiles)
		{
			seek_to_line(f, num_str);
			if (strcmp(num_str, "") != 0)
			{
				f.currentValue = stoi(num_str);
				++f.currentPos;
				pq.push(f);
			}
		}
		int prev_value = INT_MIN;
		bool sorted = true;
		while (pq.size() > 0)
		{
			++cnt;
			auto f = pq.top(); pq.pop();
			sorted = prev_value <= f.currentValue;
			prev_value = f.currentValue;
			outfile << f.currentValue << endl;
			cout << f.currentValue << endl;
			if (f.currentPos == f.totalCnt)//该文件已读取完
			{
				remove(f.filepath.c_str());
				continue;
			}
				
			seek_to_line(f, num_str);
			if (strcmp(num_str, "") != 0)
			{
				f.currentValue = stoi(num_str);
				++f.currentPos;
				pq.push(f);
			}
		}
		outfile.close();
		sorted = cnt == total;
		cout << "排序完成" << endl;
		cout << "共读取:" << total << ";" << "被排序:" << cnt << endl;
		cout << "正确性:";
		if (sorted) cout << "正确" << endl;
		else cout << "错误" << endl;
	}
	catch (const char* msg)
	{
		cout << msg << endl;
	}
	return outfile_path;
}

main文件

#include <iostream>
#include <ctime>
#include <string>
#include "../DiskSort/GenerateRandomNumber.h"
#include "../DiskSort/DiskSort.h"
using namespace std;

// Demo driver: writes `cnt` random numbers to a fixed file, sorts that file with
// DiskSort and prints the processor time the sort took, in seconds.
int main()
{
	const int cnt = 1000000;
	const std::string filepath = "C:\\WorkSpace\\CPP\\DiskSort\\DiskSort\\randomlist.txt";
	if (GenerateRandom(cnt, filepath))
	{
		std::cout << "生成成功!" << std::endl;
		// clock() returns clock_t (not time_t); keep the matching type so the
		// tick difference divided by CLOCKS_PER_SEC is meaningful.
		const std::clock_t start = std::clock();
		DiskSort(filepath);
		const std::clock_t end = std::clock();
		std::cout << static_cast<double>(end - start) / CLOCKS_PER_SEC << "s" << std::endl;
	}
	else
		std::cout << "生成失败!" << std::endl;

	return 0;
}

猜你喜欢

转载自blog.csdn.net/EasonDongH/article/details/87805032