Single-machine MapReduce:
package com.test.mryinru;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MySingleWordCount {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new URI("hdfs://marshal:9000"), new Configuration(), "root");
        FSDataInputStream in = fs.open(new Path("/wordcount/input/a.txt"));
        BufferedReader br = new BufferedReader(new InputStreamReader(in));
        String line = null;
        HashMap<String, Integer> map = new HashMap<>();
        // Read the file line by line
        while ((line = br.readLine()) != null) {
            // Split the line into words
            String[] words = line.split(" ");
            // Count the words with a HashMap
            for (String word : words) {
                if (map.containsKey(word)) {
                    map.put(word, map.get(word) + 1);
                } else {
                    map.put(word, 1);
                }
            }
        }
        br.close();  // also closes the underlying input stream

        fs.mkdirs(new Path("/wordcount/output/"));
        FSDataOutputStream out = fs.create(new Path("/wordcount/output/result.txt"));
        // Output format, one word per line, e.g.:
        // a,3
        // b,4
        Set<Entry<String, Integer>> entrySet = map.entrySet();
        for (Entry<String, Integer> entry : entrySet) {
            out.write((entry.getKey() + "," + entry.getValue()).getBytes());
            out.write("\n".getBytes());
        }
        out.close();
        fs.close();
    }
}
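(Side note: on Java 8 and later, the containsKey/put branch in the counting loop can be collapsed into a single Map.merge call; a minimal equivalent sketch:

    map.merge(word, 1, Integer::sum);  // put 1 if the word is absent, otherwise add 1 to the current count

The reduce task further down uses the same explicit branch and could be shortened the same way.)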
Simulated distributed MapReduce:
package com.test.mryinru;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MyDistributedWordCountMapTask {
    public static void main(String[] args) throws Exception {
        int taskId = Integer.parseInt(args[0]);
        String file = args[1];
        long startOffset = Long.parseLong(args[2]);
        long length = Long.parseLong(args[3]);

        FileSystem fs = FileSystem.get(new URI("hdfs://marshal:9000"), new Configuration(), "root");
        FSDataInputStream in = fs.open(new Path(file));
        // Seek to the start of this task's split before wrapping the stream,
        // so the BufferedReader never buffers data from before the split
        in.seek(startOffset);
        BufferedReader br = new BufferedReader(new InputStreamReader(in));

        String line = null;
        long count = 0;
        // One intermediate output file per reduce partition
        FSDataOutputStream tmpOut_0 = fs.create(new Path("/wordcount/tmp/part-m-" + taskId + "-0"));
        FSDataOutputStream tmpOut_1 = fs.create(new Path("/wordcount/tmp/part-m-" + taskId + "-1"));

        /**
         * Unless this is the task with the lowest id, skip the first line of the
         * split: the previous task always reads one line past its own boundary,
         * so that (possibly partial) line is already handled. The skipped bytes
         * still belong to this split, so they count against its length.
         */
        if (taskId != 0) {
            String skipped = br.readLine();
            if (skipped != null) {
                count += skipped.length() + 1;
            }
        }
        while ((line = br.readLine()) != null) {
            String[] words = line.split(" ");
            for (String word : words) {
                // Partition by word hash, masking the sign bit so negative hash
                // codes cannot yield a negative modulo (the same idiom as
                // Hadoop's HashPartitioner)
                if ((word.hashCode() & Integer.MAX_VALUE) % 2 == 0) {
                    tmpOut_0.write((word + "\t" + 1 + "\n").getBytes());
                } else {
                    tmpOut_1.write((word + "\t" + 1 + "\n").getBytes());
                }
            }
            // readLine() strips the newline, so add one for it
            // (this assumes single-byte characters, i.e. one char == one byte)
            count += line.length() + 1;
            /**
             * Always read one line past the split boundary; the next task
             * skips that line.
             */
            if (count > length) {
                break;
            }
        }
        br.close();  // also closes the underlying input stream
        tmpOut_0.close();
        tmpOut_1.close();
        fs.close();
    }
}
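To actually drive the map phase, something has to compute the split boundaries and pass them as the four arguments above. The original does not include a launcher, so here is a minimal sketch of one (the class name MyMapTaskLauncher and the fixed two-way split are my assumptions): it asks HDFS for the file length, halves it, and calls the task's main method once per split; a real framework would run each call as its own process on a separate machine.

package com.test.mryinru;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical launcher for the simulated map phase (not in the original):
// splits the input file into two byte ranges and runs one map task per range.
public class MyMapTaskLauncher {
    public static void main(String[] args) throws Exception {
        String file = "/wordcount/input/a.txt";
        FileSystem fs = FileSystem.get(new URI("hdfs://marshal:9000"), new Configuration(), "root");
        long fileLen = fs.getFileStatus(new Path(file)).getLen();
        fs.close();
        long splitLen = fileLen / 2;
        // Task 0 covers [0, splitLen), task 1 covers [splitLen, fileLen);
        // on a real cluster each call would be a separate JVM on a separate node
        MyDistributedWordCountMapTask.main(new String[]{"0", file, "0", String.valueOf(splitLen)});
        MyDistributedWordCountMapTask.main(new String[]{"1", file, String.valueOf(splitLen), String.valueOf(fileLen - splitLen)});
    }
}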
package com.test.mryinru;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class MyDistributedWordCountReduceTask {
    public static void main(String[] args) throws Exception {
        // Each reduce task fetches the map-phase files that carry its own id
        int taskId = Integer.parseInt(args[0]);
        FileSystem fs = FileSystem.get(new URI("hdfs://marshal:9000"), new Configuration(), "root");
        // List the intermediate result files in the tmp directory
        // (the second argument controls whether to recurse into subdirectories)
        RemoteIterator<LocatedFileStatus> files = fs.listFiles(new Path("/wordcount/tmp/"), false);
        HashMap<String, Integer> map = new HashMap<>();
        // Iterate over all files in the tmp directory
        while (files.hasNext()) {
            LocatedFileStatus file = files.next();
            // Only read files whose partition suffix matches this task's id
            if (file.getPath().getName().endsWith("-" + taskId)) {
                FSDataInputStream in = fs.open(file.getPath());
                BufferedReader br = new BufferedReader(new InputStreamReader(in));
                String line = null;
                while ((line = br.readLine()) != null) {
                    String[] split = line.split("\t");
                    // Accumulate the counts carried by the intermediate records
                    // (each map-phase record carries a count of 1)
                    if (map.containsKey(split[0])) {
                        map.put(split[0], map.get(split[0]) + Integer.parseInt(split[1]));
                    } else {
                        map.put(split[0], Integer.parseInt(split[1]));
                    }
                }
                br.close();  // also closes the underlying input stream
            }
        }
        // Write the final result for this partition
        FSDataOutputStream out = fs.create(new Path("/wordcount/output/part-r-" + taskId));
        Set<Entry<String, Integer>> entrySet = map.entrySet();
        for (Entry<String, Integer> entry : entrySet) {
            out.write((entry.getKey() + "\t" + entry.getValue() + "\n").getBytes());
        }
        out.close();
        fs.close();
    }
}
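The reduce phase can be driven the same way once both map tasks have finished; a minimal sketch (MyReduceTaskLauncher is again an assumed name, not part of the original), with one reduce task per partition, after which /wordcount/output/part-r-0 and part-r-1 together hold the complete word count:

package com.test.mryinru;

// Hypothetical launcher for the simulated reduce phase (not in the original):
// one reduce task per partition, run after all map tasks have finished.
public class MyReduceTaskLauncher {
    public static void main(String[] args) throws Exception {
        // Partition 0 aggregates the part-m-*-0 files, partition 1 the part-m-*-1 files
        MyDistributedWordCountReduceTask.main(new String[]{"0"});
        MyDistributedWordCountReduceTask.main(new String[]{"1"});
    }
}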