hadoop入门程序，wordcount本地版实现

实验环境
javaee
windows

步骤：
1.先在D盘建一个test文件夹，在里面新建一个文本文档word01.txt，随意输入几个单词，单词之间用制表符（Tab）分隔（代码按制表符切分单词），例如：

hello	world	tom	hello	mading	world	mading	mading asdfasdfasdfasdfasdf

然后保存，再把这个文件复制两份，分别命名为word02.txt和word03.txt（后面的多文件统计会读取这三个文件）

2.单个文件的词频统计代码

package mrpro924;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Counts how often each word occurs in a single text file.
 *
 * <p>A "word" is a non-empty, tab-separated token of a line. The resulting
 * word-to-count map is printed to standard output.
 */
public class WordCountJava {
	/** File that is read when no path is given on the command line. */
	private static final String DEFAULT_PATH = "D:\\test\\word01.txt";

	/**
	 * Reads the input file line by line, counts every tab-separated word and
	 * prints the resulting map.
	 *
	 * @param args optional; when present, {@code args[0]} is the path of the
	 *             file to read instead of the built-in default
	 * @throws IOException if the file cannot be opened or read
	 */
	public static void main(String[] args) throws IOException {
		String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PATH;
		// key: word, value: number of occurrences seen so far
		Map<String,Integer> map = new HashMap<>();
		// try-with-resources closes the reader even if readLine() throws
		try (BufferedReader br = new BufferedReader(new FileReader(path))) {
			String line;
			while ((line = br.readLine()) != null) {
				for (String s : line.split("\t")) {
					// a blank line or two adjacent tabs produce "" - that is not a word
					if (s.isEmpty()) {
						continue;
					}
					// single lookup: null means this is the first occurrence
					Integer previous = map.get(s);
					map.put(s, previous == null ? 1 : previous + 1);
				}
			}
		}
		System.out.println(map);
	}
}

得到输出结果

{world=2, tom=1, mading=3, hello=2}

3.多个文件的词频统计

package mrpro924;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Word statistics over several files.
 *
 * <p>Each file is counted on its own ({@link #singleWordCount(String)}) and the
 * per-file maps are then combined by the aggregating methods of this class -
 * the same split-then-combine idea that MapReduce is built on.
 */
public class localFilesWordCount {
	/** Files that {@link #main(String[])} reads when no paths are passed on the command line. */
	private static final String[] DEFAULT_PATHS = {
		"D:\\test\\word01.txt",
		"D:\\test\\word02.txt",
		"D:\\test\\word03.txt"
	};

	/**
	 * Counts the words of one file.
	 *
	 * <p>A "word" is a non-empty, tab-separated token of a line; empty tokens
	 * (blank lines, adjacent tabs) are skipped.
	 *
	 * @param path path of the file to read
	 * @return a new map from word to its number of occurrences in the file
	 * @throws IOException if the file cannot be opened or read
	 */
	public static Map<String,Integer> singleWordCount(String path) throws IOException{
		Map<String,Integer> map = new HashMap<>();
		// try-with-resources closes the reader even if readLine() throws
		try (BufferedReader br = new BufferedReader(new FileReader(path))) {
			String line;
			while ((line = br.readLine()) != null) {
				for (String s : line.split("\t")) {
					// a blank line or two adjacent tabs produce "" - that is not a word
					if (s.isEmpty()) {
						continue;
					}
					// single lookup: null means this is the first occurrence
					Integer previous = map.get(s);
					map.put(s, previous == null ? 1 : previous + 1);
				}
			}
		}
		return map;
	}

	/**
	 * Sums several word-count maps into one.
	 *
	 * @param maps per-file word counts; the maps are only read, never modified
	 * @return a new map whose value for each word is the sum of that word's
	 *         counts over all given maps
	 */
	@SafeVarargs // the varargs array is only iterated, never written to or exposed
	private static Map<String,Integer> mergeResult(Map<String, Integer> ...maps) {
		Map<String,Integer> resmap = new HashMap<>();
		for (Map<String,Integer> m : maps) {
			for (Map.Entry<String,Integer> entry : m.entrySet()) {
				Integer soFar = resmap.get(entry.getKey());
				// first sighting: take the count as is; otherwise add to the running total
				resmap.put(entry.getKey(), soFar == null ? entry.getValue() : soFar + entry.getValue());
			}
		}
		return resmap;
	}

	/**
	 * Finds the longest word among the keys of the given maps.
	 *
	 * <p>When several words share the maximum length, the one that sorts first
	 * in natural {@code String} order is reported, so the result does not
	 * depend on hash-map iteration order.
	 *
	 * @param maps per-file word counts; only the keys are inspected
	 * @return the longest word followed by a label and its length, or the empty
	 *         string when the maps contain no keys at all
	 */
	@SafeVarargs // the varargs array is only iterated, never written to or exposed
	public static String LongWord(Map<String,Integer> ...maps){
		String longest = null;
		for (Map<String,Integer> m : maps) {
			for (String s : m.keySet()) {
				boolean better = longest == null
						|| s.length() > longest.length()
						|| (s.length() == longest.length() && s.compareTo(longest) < 0);
				if (better) {
					longest = s;
				}
			}
		}
		return longest == null ? "" : longest + "的长度最长是:" + longest.length();
	}

	/**
	 * Reports the largest number of distinct words found in any single map.
	 *
	 * <p>Only the count is reported; the result does not say which map (file)
	 * it belongs to.
	 *
	 * @param maps per-file word counts; only their sizes are inspected
	 * @return a label followed by the largest map size (0 when no maps are given)
	 */
	@SafeVarargs // the varargs array is only iterated, never written to or exposed
	public static String ManyWords(Map<String,Integer> ...maps){
		int max = 0;
		for (Map<String,Integer> m : maps) {
			max = Math.max(max, m.size());
		}
		return "单词最多的个数是:" + max;
	}

	/**
	 * Counts every input file separately, then prints the merged word counts,
	 * the longest word and the largest per-file distinct-word count.
	 *
	 * @param args optional file paths; when empty, the three built-in default
	 *             files are read
	 * @throws IOException if any input file cannot be opened or read
	 */
	public static void main(String[] args) throws IOException {
		String[] paths = (args != null && args.length > 0) ? args : DEFAULT_PATHS;
		// Generic arrays cannot be created directly; the array only ever holds
		// the Map<String,Integer> instances returned by singleWordCount.
		@SuppressWarnings("unchecked")
		Map<String, Integer>[] counts = (Map<String, Integer>[]) new Map<?, ?>[paths.length];
		for (int i = 0; i < paths.length; i++) {
			counts[i] = singleWordCount(paths[i]);
		}
		// all words of all files with their total number of occurrences
		System.out.println(mergeResult(counts));
		System.out.println(LongWord(counts));
		System.out.println(ManyWords(counts));
	}
}

这段代码可以引申出MapReduce的基本思想
其中singleWordCount方法（对每个文件单独进行统计）相当于map
mergeResult，LongWord，ManyWords这几个方法相当于reduce
其中思想就是将大任务切分成小任务进行计算后汇总
这种解决问题的思路封装成一个框架就是MapReduce

hadoop入门程序，wordcount本地版实现

猜你喜欢