Compared with using MapReduce directly, simulating it in plain Java is noticeably more involved, but the exercise is valuable for beginners: it deepens understanding of how MapReduce actually works and reinforces Java fundamentals.
The code is fairly basic; brief inline comments mark the key steps.
Key points: how the map phase simulates reading across multiple machines, how much each machine reads and where its output goes, why every machine after the first skips a line, and how a machine that lands in the middle of a line handles it.
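Before the map code itself, it helps to see where the startOfSet and length arguments come from. The sketch below is a hypothetical driver (not part of the original code) that divides an input file evenly among numTasks simulated machines; the HDFS URI matches the one used later, while the input path and class name are assumptions for illustration.

package LogsToHaDoop;

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical driver: computes the (taskId, file, startOfSet, length)
// arguments each simulated map "machine" would be started with.
public class SplitPlanner {
    public static void main(String[] args) throws Exception {
        String file = "/input/words.txt";  // assumed input path
        int numTasks = 2;                  // number of simulated machines
        FileSystem fs = FileSystem.get(new URI("hdfs://jiqun01:9000"), new Configuration(), "root");
        long totalLen = fs.getFileStatus(new Path(file)).getLen();
        long splitLen = totalLen / numTasks;
        for (int taskId = 1; taskId <= numTasks; taskId++) {
            long start = (taskId - 1) * splitLen;
            // The last split absorbs the remainder so no bytes are lost.
            long len = (taskId == numTasks) ? totalLen - start : splitLen;
            System.out.println("MapTask args: " + taskId + " " + file + " " + start + " " + len);
        }
        fs.close();
    }
}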
Map simulation
package LogsToHaDoop;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class MapTask {
    public static void main(String[] args) throws Exception {
        /**
         * taskId     identifies the simulated machine
         * file       the input file on HDFS
         * startOfSet the byte offset this machine starts reading from
         * length     how many bytes this machine is responsible for
         */
        int taskId = Integer.parseInt(args[0]);
        String file = args[1];
        long startOfSet = Long.parseLong(args[2]);
        long length = Long.parseLong(args[3]);
        FileSystem fs = FileSystem.get(new URI("hdfs://jiqun01:9000"), new Configuration(), "root");
        FSDataInputStream inputStream = fs.open(new Path(file));
        // Create the two output files; the suffix (-1 / -2) is the partition
        // that a ReduceTask later selects on.
        FSDataOutputStream out_tmp_1 = fs.create(new Path("/wordCountOne-" + taskId + "-1"));
        FSDataOutputStream out_tmp_2 = fs.create(new Path("/wordCountTwo-" + taskId + "-2"));
        // Seek to this task's start offset.
        inputStream.seek(startOfSet);
        BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
        // Every machine after the first skips its first (possibly partial) line:
        // that line is finished by the previous machine, which always completes
        // the line it is in the middle of when it reaches its boundary.
        if (taskId != 1) {
            br.readLine();
        }
        long count = 0;
        String line = null;
        while ((line = br.readLine()) != null) {
            String[] split = line.split("\\s+");
            for (String word : split) {
                // Partition words by hash parity, mimicking the shuffle:
                // even hash codes go to partition 1, the rest to partition 2.
                if (word.hashCode() % 2 == 0) {
                    out_tmp_1.write((word + "\t" + 1 + "\n").getBytes());
                } else {
                    out_tmp_2.write((word + "\t" + 1 + "\n").getBytes());
                }
            }
            // A newline on Linux is 1 byte; line.length() counts characters,
            // which equals bytes for ASCII input.
            count += line.length() + 1;
            // Stop once we have read past our boundary; the line crossing it
            // belongs to this machine, and the next machine skips it.
            if (count > length) {
                break;
            }
        }
        br.close();
        out_tmp_1.close();
        out_tmp_2.close();
        fs.close();
    }
}
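With splits computed as in the driver sketch above, each simulated "machine" is just a separate JVM started with its own arguments. For example, for a 2048-byte file split across two machines (the jar name and input path are hypothetical):

java -cp wc.jar:`hadoop classpath` LogsToHaDoop.MapTask 1 /input/words.txt 0 1024
java -cp wc.jar:`hadoop classpath` LogsToHaDoop.MapTask 2 /input/words.txt 1024 1024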
Reduce simulation
package LogsToHaDoop;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
public class ReduceTask {
    public static void main(String[] args) throws Exception {
        // taskId selects which partition (-1 or -2) this reducer aggregates.
        int taskId = Integer.parseInt(args[0]);
        Map<String, Integer> map = new HashMap<>();
        FileSystem fs = FileSystem.get(new URI("hdfs://jiqun01:9000"), new Configuration(), "root");
        // Walk HDFS recursively and pick up every map output for this partition.
        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path("/"), true);
        while (listFiles.hasNext()) {
            LocatedFileStatus file = listFiles.next();
            String name = file.getPath().getName();
            // Match only map outputs (wordCountOne-*/wordCountTwo-*), so that a
            // leftover wordCountResult-<taskId> from an earlier run is not re-read.
            if ((name.startsWith("wordCountOne-") || name.startsWith("wordCountTwo-"))
                    && name.endsWith("-" + taskId)) {
                FSDataInputStream inputStream = fs.open(file.getPath());
                BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                String line = null;
                while ((line = br.readLine()) != null) {
                    // Each map output line is "word\t1"; sum the counts per word.
                    String[] split = line.split("\t");
                    Integer count = map.getOrDefault(split[0], 0);
                    count += Integer.parseInt(split[1]);
                    map.put(split[0], count);
                }
                br.close();
                inputStream.close();
            }
        }
        // Write the aggregated counts for this partition.
        FSDataOutputStream outputStream = fs.create(new Path("/wordCountResult-" + taskId));
        Set<Entry<String, Integer>> entrySet = map.entrySet();
        for (Entry<String, Integer> entry : entrySet) {
            outputStream.write((entry.getKey() + "=" + entry.getValue() + "\n").getBytes());
        }
        outputStream.close();
        fs.close();
    }
}
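Once every MapTask has finished, run one ReduceTask per partition (again, the jar name is hypothetical):

java -cp wc.jar:`hadoop classpath` LogsToHaDoop.ReduceTask 1
java -cp wc.jar:`hadoop classpath` LogsToHaDoop.ReduceTask 2

Each reducer writes its final counts to /wordCountResult-1 and /wordCountResult-2 respectively.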
Screenshot of the run results