只有1G内存,如何对10G的文件中数据进行排序

  思考

     1.归并、外排 2.多线程  RecursiveTask

测试生成文件代码

/***
 *@author dongsheng
 *@date 2019/1/18 22:58
 *@Description:
 *@version 1.0.0
 */
public class GenerateNumber {

	private static final String filePath="F:/file_test/data1.txt";

	public static void main(String[] args) throws IOException {
		//随机生成数据
		Random random = new Random();
		try (PrintWriter out = new PrintWriter(new File(filePath))) {
			long beginTime = System.currentTimeMillis();
			System.out.println("beginTime:"+beginTime);
			for (int i = 0; i < 1_000_000; i++) {
				out.println(random.nextInt());
				if (i % 10000 == 0)
					out.flush();
			}
			System.out.println("endTime:"+(System.currentTimeMillis()-beginTime));
		}
	}
}

排序代码

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ForkJoinPool;


/***
 *@author dongsheng
 *@date 2019/1/18 22:58
 *@Description:
 *@version 1.0.0
 */
public class SortByForkjoin {

	private static final String filePath="F:/file_test/data1.txt";
	private static final String afterFilePath="F:/file_test/data1-sort.txt";

	public static void main(String[] args) throws Exception {
		ForkJoinPool pool = new ForkJoinPool();

		/***
		 * 1.从文件中读取内存容量可处理的数据量 ,拆分为多个部分排序的文件
		 * 2.每100_000条数据为一个文件
		 * */
		int size = 100_000;
		int[] array = new int[size];
		BufferedReader reader = new BufferedReader(new FileReader(filePath));
		String line = null;
		int i = 0;
		int partition = 0;

		/***
		 * 1.拆分的部分文件名list,后面归并部分文件需要用到它
		 * 2.每size条数据,排序后,放到一个文件
		 * */
		List<String> partFiles = new ArrayList<>();
		while ((line = reader.readLine()) != null) {
			array[i++] = Integer.parseInt(line);
			if (i == size) {
				// 重置i为0
				i = 0;
				// 排序输出到部分文件
				String filename = "F:/file_test/data1-part-" + partition++ + ".txt";
				//对当前partition的数据进行排序
				doPartitionSort(pool, filename, array, 0, size);
				partFiles.add(filename);
			}
		}
		reader.close();

		// 未达到size的部分数据,排序后放一个文件
		if (i > 0) {
			// 排序输出到部分文件
			String filename = "F:/file_test/data1-part-" + partition++ + ".txt";
			doPartitionSort(pool, filename, array, 0, i);
			partFiles.add(filename);
		}

		if (partition > 1) {
			// 归并排序
			MergerFileSortTask mtask = new MergerFileSortTask(partFiles, afterFilePath);
			pool.submit(mtask);
			mtask.get();

		} else {
			// 将唯一的一个部分文件重命名为最终结果文件名
		}
		pool.shutdown();
	}

	/**
	 * partition的数据进行排序
	 * @param pool  工作池
	 * @param filename  文件名
	 * @param array  数组
	 * @param start  开始下标
	 * @param end   结束下标
	 * @throws Exception
	 */
	static void doPartitionSort(ForkJoinPool pool, String filename, int[] array, int start, int end) throws Exception {
		ArrayMergerSortTask task = new ArrayMergerSortTask(array, start, end);
		pool.submit(task);
		task.get();
		try (PrintWriter pw = new PrintWriter(filename);) {
			for (int i = start; i < end; i++) {
				pw.println(array[i]);
			}
		}
	}
}



/***
 *@author dongsheng
 *@date 2019/1/18 22:58
 *@Description:
 *@version 1.0.0
 */
public class ArrayMergerSortTask extends RecursiveAction {

	// implementation details follow:
	static final int THRESHOLD = 1000;

	final int[] array;
	final int lo, hi;

	ArrayMergerSortTask(int[] array, int lo, int hi) {
		this.array = array;
		this.lo = lo;
		this.hi = hi;
	}

	ArrayMergerSortTask(int[] array) {
		this(array, 0, array.length);
	}

	protected void compute() {
		if (hi - lo < THRESHOLD)		//小于1000,就排序
			sortSequentially(lo, hi);
		else {
			int mid = (lo + hi) >>> 1;		//大于1000,拆分
			invokeAll(new ArrayMergerSortTask(array, lo, mid),
					new ArrayMergerSortTask(array, mid, hi));
			merge(lo, mid, hi);
		}
	}

	void sortSequentially(int lo, int hi) {
		Arrays.sort(array, lo, hi);		//利用JDK自带的排序进行
	}

	void merge(int lo, int mid, int hi) {
		int[] buf = Arrays.copyOfRange(array, lo, mid);
		for (int i = 0, j = lo, k = mid; i < buf.length; j++)
			array[j] = (k == hi || buf[i] < array[k]) ? buf[i++] : array[k++];
	}

	
}


/***
 *@author dongsheng
 *@date 2019/1/18 22:58
 *@Description:
 *@version 1.0.0
 */
public class MergerFileSortTask extends RecursiveTask<String> {

	List<String> partFiles;
	int lo, hi;

	String filename;

	public MergerFileSortTask(List<String> partFiles, int lo, int hi, String filename) {
		super();
		this.partFiles = partFiles;
		this.lo = lo;
		this.hi = hi;
		this.filename = filename;
	}

	public MergerFileSortTask(List<String> partFiles, String filename) {
		this(partFiles, 0, partFiles.size(), filename);
	}

	@Override
	protected String compute() {
		// 如果要归并的文件数大于2,则fork
		int fileCount = hi - lo;
		if (fileCount > 2) {		//fileCount>2 则继续拆分
			int mid = fileCount / 2;
			MergerFileSortTask task1 = new MergerFileSortTask(partFiles, lo, lo + mid, this.filename + "-1");
			MergerFileSortTask task2 = new MergerFileSortTask(partFiles, lo + mid, hi, this.filename + "-2");

			// 任务提交forkjoinPool中去执行
			task1.fork();
			task2.fork();

			// 合并两个文件
			try {		//文件
				this.mergerFile(task1.get(), task2.get());	//合并执行结果
			} catch (IOException | InterruptedException | ExecutionException e) {

				e.printStackTrace();
			}
		} else if (fileCount == 2) {		//文件个数为2,合并文件
			// 合并两个文件
			try {
				this.mergerFile(this.partFiles.get(lo), this.partFiles.get(lo + 1));
			} catch (IOException e) {

				e.printStackTrace();
			}
		} else if (fileCount == 1) {
			return this.partFiles.get(lo);
		}
		return this.filename;
	}

	private void mergerFile(String f1, String f2) throws IOException {
		try (BufferedReader reader1 = new BufferedReader(new FileReader(f1));
				BufferedReader reader2 = new BufferedReader(new FileReader(f2));
				PrintWriter pw = new PrintWriter(filename);) {

			String s1 = reader1.readLine(), s2 = reader2.readLine();
			Integer d1 = s1 == null ? null : Integer.valueOf(s1);
			Integer d2 = s2 == null ? null : Integer.valueOf(s2);
			while (true) {
				if (s1 == null) {		//s1为null,直接把2写完
					// 直接读取reader2,写
					while (s2 != null) {
						pw.println(s2);
						s2 = reader2.readLine();
					}
				} else if (s2 == null) {  //s2为null,直接把1写完
					// 直接读取reader1,写
					while (s1 != null) {
						pw.println(s1);
						s1 = reader1.readLine();
					}
				} else {

					// 比较两个值
					while (d1 <= d2 && s1 != null) {
						// 写s1, 继续读reader1
						pw.println(s1);
						s1 = reader1.readLine();
						if (s1 != null) {
							d1 = Integer.valueOf(s1);
						}
					}

					while (d1 > d2 && s2 != null) {
						// 写s2, 继续读reader2
						pw.println(s2);
						s2 = reader2.readLine();
						if (s2 != null) {
							d2 = Integer.valueOf(s2);
						}
					}

				}

				if (s1 == null && s2 == null) // 都读完了
					break;
			}
		}
	}
}
RecursiveTask

      RecusiveTaskRecusiveAction相似,只不过每个子任务处理之后会带一个返回值,最终所有的子任务的返回结果会join(合并)成一个结果.

源代码

/*
 *
 *
 *
 *
 *
 * Written by Doug Lea with assistance from members of JCP JSR-166
 * Expert Group and released to the public domain, as explained at
 * http://creativecommons.org/publicdomain/zero/1.0/
 */

package java.util.concurrent;

/**
 * A recursive result-bearing {@link ForkJoinTask}.
 *
 * <p>For a classic example, here is a task computing Fibonacci numbers:
     斐波那契
 *
 *  <pre> {@code
 * class Fibonacci extends RecursiveTask<Integer> {
 *   final int n;
 *   Fibonacci(int n) { this.n = n; }
 *   Integer compute() {
 *     if (n <= 1)
 *       return n;
 *     Fibonacci f1 = new Fibonacci(n - 1);
 *     f1.fork();
 *     Fibonacci f2 = new Fibonacci(n - 2);
 *     return f2.compute() + f1.join();
 *   }
 * }}</pre>
 *
 * However, besides being a dumb way to compute Fibonacci functions
 * (there is a simple fast linear algorithm that you'd use in
 * practice), this is likely to perform poorly because the smallest
 * subtasks are too small to be worthwhile splitting up. Instead, as
 * is the case for nearly all fork/join applications, you'd pick some
 * minimum granularity size (for example 10 here) for which you always
 * sequentially solve rather than subdividing.
 *
 * @since 1.7
 * @author Doug Lea
 */
public abstract class RecursiveTask<V> extends ForkJoinTask<V> {
    private static final long serialVersionUID = 5232453952276485270L;

    /**
     * The result of the computation.
     */
    V result;

    /**
     * The main computation performed by this task.
     * @return the result of the computation
     */
    protected abstract V compute();

    public final V getRawResult() {
        return result;
    }

    protected final void setRawResult(V value) {
        result = value;
    }

    /**
     * Implements execution conventions for RecursiveTask.
     */
    protected final boolean exec() {
        result = compute();
        return true;
    }

}
发布了121 篇原创文章 · 获赞 60 · 访问量 36万+

猜你喜欢

转载自blog.csdn.net/sdmxdzb/article/details/104409250