Big Data Learning Path 10 - Introducing the MapReduce Idea

Single-machine MapReduce: a single process reads the input file from HDFS, counts the words in a HashMap, and writes the result back to HDFS.

package com.test.mryinru;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MySingleWordCount {
    public static void main(String[] args) throws Exception {
    	FileSystem fs = FileSystem.get(new URI("hdfs://marshal:9000"),new Configuration(),"root");
    	FSDataInputStream in = fs.open(new Path("/wordcount/input/a.txt"));
    	BufferedReader br = new BufferedReader(new InputStreamReader(in));
    	String line=null;
    	HashMap<String,Integer> map = new HashMap<>();
		// read the file line by line
    	while((line = br.readLine())!=null){
    		// split the line into words
    		String[] words = line.split(" ");
    		// count the words with a HashMap
    		for (String word : words) {
				if(map.containsKey(word)){
					map.put(word, map.get(word)+1);
				}
				else{
					map.put(word,1);
				}
			}
    		
    	}
    	br.close();
    	in.close();
    	fs.mkdirs(new Path("/wordcount/output/"));
    	FSDataOutputStream out = fs.create(new Path("/wordcount/output/result.txt"));
    	// each output line is "word,count", e.g.:
    	//   a,3
    	//   b,4
    	Set<Entry<String, Integer>> entrySet = map.entrySet();
    	for (Entry<String, Integer> entry : entrySet) {
			out.write((entry.getKey()+","+entry.getValue()).getBytes());
			out.write("\n".getBytes());
			
		}
    	out.close();
    	fs.close();
    	
	}
}
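
As a side note, the containsKey branch in the counting loop can be collapsed with Java 8's Map.merge; a minimal equivalent of the loop body, using the same map and words variables as above:

    	for (String word : words) {
    		// insert 1 for a new word, otherwise add 1 to the existing count
    		map.merge(word, 1, Integer::sum);
    	}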

Simulating distributed MapReduce: the work is split across separate map-task and reduce-task programs. Each map task reads one byte range (split) of the input file and partitions its output by word hash into one intermediate file per reduce task; each reduce task then collects the intermediate files for its partition and aggregates the counts.

package com.test.mryinru;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MyDistributedWordCountMapTask {
        public static void main(String[] args) throws Exception {
        	int taskId = Integer.parseInt(args[0]);
			String file = args[1];
			long startOffset = Long.parseLong(args[2]);
			long length = Long.parseLong(args[3]);
			FileSystem fs = FileSystem.get(new URI("hdfs://marshal:9000"),new Configuration(),"root");
			FSDataInputStream in = fs.open(new Path(file));
			BufferedReader br = new BufferedReader(new InputStreamReader(in));
			// seek to the start of the split this task is responsible for
			in.seek(startOffset);
			String line = null;
			long count = 0;
			// one intermediate output file per reduce partition
			FSDataOutputStream tmpOut_0 = fs.create(new Path("/wordcount/tmp/part-m-"+taskId+"-0"));
			FSDataOutputStream tmpOut_1 = fs.create(new Path("/wordcount/tmp/part-m-"+taskId+"-1"));
			/**
			 * If this is not the task with the smallest id, skip the first line of the split:
			 * the previous task always reads one line past its own boundary, so that
			 * (possibly partial) line has already been handled there.
			 */
			if(taskId!=0){
				br.readLine();
			}
			while((line = br.readLine())!=null){
				
				
				String[] words = line.split(" ");
			    for (String word : words) {
					// partition by word hash: each word goes to exactly one of the two intermediate files
					if(word.hashCode()%2==0){
						tmpOut_0.write((word+"\t"+1+"\n").getBytes());
					}
					else{
						tmpOut_1.write((word+"\t"+1+"\n").getBytes());
					}
				}
			    // readLine() strips the line terminator, so add 1 to keep the running
			    // count aligned with the split length (assumes single-byte characters and '\n')
			    count += line.length()+1;
			    /**
			     * Always read one line past the split boundary, so that a line which
			     * straddles the boundary is processed entirely by this task.
			     */
				if(count > length){
					break;
				}
				
			}
			in.close();
			br.close();
			
			tmpOut_0.close();
			tmpOut_1.close();
			fs.close();
			
		}
}
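
One caveat about the partitioning rule above: word.hashCode()%2 evaluates to -1 for a word with a negative, odd hash code. With only two output files that still falls into the else branch, so the result is correct, but the pattern does not extend safely to more partitions. Hadoop's own HashPartitioner masks off the sign bit first; a minimal sketch of that variant (numPartitions is an illustrative local variable, not part of the original code):

			// sign-safe hash partitioning, as done by Hadoop's HashPartitioner:
			// masking with Integer.MAX_VALUE clears the sign bit, so the result
			// always lies in [0, numPartitions)
			int numPartitions = 2;
			int partition = (word.hashCode() & Integer.MAX_VALUE) % numPartitions;
			// partition 0 -> part-m-<taskId>-0, partition 1 -> part-m-<taskId>-1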
package com.test.mryinru;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class MyDistributedWordCountReduceTask {
           public static void main(String[] args) throws Exception {
			// use this task's id to pick up the matching intermediate files produced by the map phase
        	   int taskId = Integer.parseInt(args[0]);
        	   FileSystem fs = FileSystem.get(new URI("hdfs://marshal:9000"),new Configuration(),"root");
		       // list the intermediate files in the temp directory (the second argument says whether to recurse)
        	   RemoteIterator<LocatedFileStatus> files = fs.listFiles(new Path("/wordcount/tmp/"), false);
        	   HashMap<String, Integer> map = new HashMap<String,Integer> ();
        	  // iterate over every file in the temp directory
        	   while(files.hasNext()){
        		   LocatedFileStatus file = files.next();
        		   // only read files whose names end with this task's id, i.e. this task's partition (fine for single-digit ids)
        		   if(file.getPath().getName().endsWith(taskId+"")){
        			   FSDataInputStream in = fs.open(file.getPath());
        			   BufferedReader br = new BufferedReader(new InputStreamReader(in));
        			   String line = null;
        			   while((line = br.readLine())!= null){
        				   String[] split = line.split("\t");
        				   // accumulate the count carried by each intermediate record
        				   int n = Integer.parseInt(split[1]);
        				   if(map.containsKey(split[0])){
        					   map.put(split[0], map.get(split[0])+n);
        				   }
        				   else{
        					   map.put(split[0], n);
        				   }
        					   
        			   }
        			   in.close();
        			   br.close();
        		    }
        	   }
        	   // write the final result for this partition
        	   FSDataOutputStream out = fs.create(new Path("/wordcount/output/part-r-"+taskId));
        	   Set<Entry<String, Integer>> entrySet = map.entrySet();
        	   for (Entry<String, Integer> entry : entrySet) {
				    out.write((entry.getKey()+"\t"+entry.getValue()+"\n").getBytes());
			    }
                out.close();
                fs.close();
           
           }      
}
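
Neither task class runs on its own: something has to compute the byte-range splits and pass the command-line arguments. A minimal driver sketch follows (the class name, the split arithmetic, and the task counts are my own assumptions, not part of the original code); it exercises the whole simulation sequentially in one JVM by calling the two main() methods, whereas in a real run each call would be a separate process on a separate machine:

package com.test.mryinru;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MyDistributedWordCountDriver {
    public static void main(String[] args) throws Exception {
    	String file = "/wordcount/input/a.txt";
    	int numMapTasks = 2;
    	int numReduceTasks = 2; // must match the number of partitions the map tasks write

    	// look up the input file's length so it can be divided into byte-range splits
    	FileSystem fs = FileSystem.get(new URI("hdfs://marshal:9000"),new Configuration(),"root");
    	long fileLen = fs.getFileStatus(new Path(file)).getLen();
    	fs.close();

    	// "map phase": each task gets (taskId, file, startOffset, length)
    	long splitSize = fileLen / numMapTasks;
    	for (int i = 0; i < numMapTasks; i++) {
    		long start = i * splitSize;
    		long len = (i == numMapTasks - 1) ? fileLen - start : splitSize;
    		MyDistributedWordCountMapTask.main(new String[]{
    				String.valueOf(i), file, String.valueOf(start), String.valueOf(len)});
    	}

    	// "reduce phase": task i aggregates every intermediate file whose name ends in i
    	for (int i = 0; i < numReduceTasks; i++) {
    		MyDistributedWordCountReduceTask.main(new String[]{String.valueOf(i)});
    	}
    }
}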


Reposted from blog.csdn.net/qq_37050372/article/details/81633863