Compared with using MapReduce directly, simulating it in plain Java is noticeably more involved, but the exercise is valuable for beginners: it deepens understanding of how MapReduce actually works and reinforces Java fundamentals.
The code is fairly basic; brief inline comments mark the key steps.
Key points: how the map phase simulates reading across multiple machines, how much each machine reads and where its output goes, why every machine after the first skips a line, and how a machine that lands in the middle of a line handles it.
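Before the map code itself, it helps to see where the startOfSet and length arguments come from. The sketch below is a hypothetical driver (not part of the original code) that divides an input file evenly among numTasks simulated machines; the HDFS URI matches the one used later, while the input path and class name are assumptions for illustration.

package LogsToHaDoop;

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical driver: computes the (taskId, file, startOfSet, length)
// arguments each simulated map "machine" would be started with.
public class SplitPlanner {
    public static void main(String[] args) throws Exception {
        String file = "/input/words.txt";  // assumed input path
        int numTasks = 2;                  // number of simulated machines
        FileSystem fs = FileSystem.get(new URI("hdfs://jiqun01:9000"), new Configuration(), "root");
        long totalLen = fs.getFileStatus(new Path(file)).getLen();
        long splitLen = totalLen / numTasks;
        for (int taskId = 1; taskId <= numTasks; taskId++) {
            long start = (taskId - 1) * splitLen;
            // The last split absorbs the remainder so no bytes are lost.
            long len = (taskId == numTasks) ? totalLen - start : splitLen;
            System.out.println("MapTask args: " + taskId + " " + file + " " + start + " " + len);
        }
        fs.close();
    }
}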
Map simulation
package LogsToHaDoop;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class MapTask {
    public static void main(String[] args) throws Exception {
        /**
         * taskId     identifies the simulated machine
         * file       the input file on HDFS
         * startOfSet the byte offset this machine starts reading from
         * length     how many bytes this machine is responsible for
         */
        int taskId = Integer.parseInt(args[0]);
        String file = args[1];
        long startOfSet = Long.parseLong(args[2]);
        long length = Long.parseLong(args[3]);
        FileSystem fs = FileSystem.get(new URI("hdfs://jiqun01:9000"), new Configuration(), "root");
        FSDataInputStream inputStream = fs.open(new Path(file));
        // Create the two output files; the suffix (-1 / -2) is the partition
        // that a ReduceTask later selects on.
        FSDataOutputStream out_tmp_1 = fs.create(new Path("/wordCountOne-" + taskId + "-1"));
        FSDataOutputStream out_tmp_2 = fs.create(new Path("/wordCountTwo-" + taskId + "-2"));
        // Seek to this task's start offset.
        inputStream.seek(startOfSet);
        BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
        // Every machine after the first skips its first (possibly partial) line:
        // that line is finished by the previous machine, which always completes
        // the line it is in the middle of when it reaches its boundary.
        if (taskId != 1) {
            br.readLine();
        }
        long count = 0;
        String line = null;
        while ((line = br.readLine()) != null) {
            String[] split = line.split("\\s+");
            for (String word : split) {
                // Partition words by hash parity, mimicking the shuffle:
                // even hash codes go to partition 1, the rest to partition 2.
                if (word.hashCode() % 2 == 0) {
                    out_tmp_1.write((word + "\t" + 1 + "\n").getBytes());
                } else {
                    out_tmp_2.write((word + "\t" + 1 + "\n").getBytes());
                }
            }
            // A newline on Linux is 1 byte; line.length() counts characters,
            // which equals bytes for ASCII input.
            count += line.length() + 1;
            // Stop once we have read past our boundary; the line crossing it
            // belongs to this machine, and the next machine skips it.
            if (count > length) {
                break;
            }
        }
        br.close();
        out_tmp_1.close();
        out_tmp_2.close();
        fs.close();
    }
}
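With splits computed as in the driver sketch above, each simulated "machine" is just a separate JVM started with its own arguments. For example, for a 2048-byte file split across two machines (the jar name and input path are hypothetical):

java -cp wc.jar:`hadoop classpath` LogsToHaDoop.MapTask 1 /input/words.txt 0 1024
java -cp wc.jar:`hadoop classpath` LogsToHaDoop.MapTask 2 /input/words.txt 1024 1024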
Reduce simulation
package LogsToHaDoop;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
public class ReduceTask {
    public static void main(String[] args) throws Exception {
        // taskId selects which partition (-1 or -2) this reducer aggregates.
        int taskId = Integer.parseInt(args[0]);
        Map<String, Integer> map = new HashMap<>();
        FileSystem fs = FileSystem.get(new URI("hdfs://jiqun01:9000"), new Configuration(), "root");
        // Walk HDFS recursively and pick up every map output for this partition.
        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path("/"), true);
        while (listFiles.hasNext()) {
            LocatedFileStatus file = listFiles.next();
            String name = file.getPath().getName();
            // Match only map outputs (wordCountOne-*/wordCountTwo-*), so that a
            // leftover wordCountResult-<taskId> from an earlier run is not re-read.
            if ((name.startsWith("wordCountOne-") || name.startsWith("wordCountTwo-"))
                    && name.endsWith("-" + taskId)) {
                FSDataInputStream inputStream = fs.open(file.getPath());
                BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                String line = null;
                while ((line = br.readLine()) != null) {
                    // Each map output line is "word\t1"; sum the counts per word.
                    String[] split = line.split("\t");
                    Integer count = map.getOrDefault(split[0], 0);
                    count += Integer.parseInt(split[1]);
                    map.put(split[0], count);
                }
                br.close();
                inputStream.close();
            }
        }
        // Write the aggregated counts for this partition.
        FSDataOutputStream outputStream = fs.create(new Path("/wordCountResult-" + taskId));
        Set<Entry<String, Integer>> entrySet = map.entrySet();
        for (Entry<String, Integer> entry : entrySet) {
            outputStream.write((entry.getKey() + "=" + entry.getValue() + "\n").getBytes());
        }
        outputStream.close();
        fs.close();
    }
}
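Once every MapTask has finished, run one ReduceTask per partition (again, the jar name is hypothetical):

java -cp wc.jar:`hadoop classpath` LogsToHaDoop.ReduceTask 1
java -cp wc.jar:`hadoop classpath` LogsToHaDoop.ReduceTask 2

Each reducer writes its final counts to /wordCountResult-1 and /wordCountResult-2 respectively.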
Screenshot of the run results