(12) Hadoop Java 实现MapReduce HelloWord 单词统计更新版

package com.my.hadoop.hadoophdfs.mapreduce;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
//文件内容 类似
/*
 * aa  vv  bb
 *  tt  yy jj 
 *  ss dd gg 
 *  hh
 *  ff
 *  ddd dd
 * */
/**
 *    单词统计类
 * @author liming
 *
 */
public class WordCountMapReduce extends Configured implements Tool {
		/**
		 * Map
		 * Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> 
		 * @author liming
		 *
		 */
		public static class WorldCountMapper extends Mapper<LongWritable ,Text, Text, IntWritable>{
				private Text mapOutputKey= new Text();
				private final static IntWritable  mapOutPutValue =new IntWritable(1);
			@Override
			public void map(LongWritable key, Text value, Context context)
					throws IOException, InterruptedException {
				//TODO
				String lineValue = value.toString();
				//分割  lineValue.split(" ") 可以这样写  但是不好 吃内存
				StringTokenizer stringTokenizer = new StringTokenizer(lineValue);
				//迭代 拿数据
				while(stringTokenizer.hasMoreElements()){
					String wordVal = stringTokenizer.nextToken();
					//变成key - value 对
					//map输出的key
					 mapOutputKey.set(wordVal);
					//mapOutPutValue =1
					//map输出
					context.write(mapOutputKey, mapOutPutValue);
				}
				
			}
			
		}
		/**
		 * Reduce 
		 * Map的类型 就是 Reduce的输入类型
		 * 
		 * @author liming
		 *
		 */
		public static class WorldCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
			//reduce输出
			private IntWritable outPutVal= new IntWritable();
			
			protected void reduce(Text key, Iterable<IntWritable> values,
				 	 Context content)
					throws IOException, InterruptedException {
				
				//TODO
				/*map的输出类似于：
				 * <aa,1>
				 * <bb,1>
				 * <jj,1>
				 * <aa,1>
				 * 
				 * map--->shuffle-->reduce 这个过程很复杂  也就是将相同key的value放到一个集合中 ----> <aa,list(1,1)>
				 *  这叫做分组group
				 * 
				 * */
				
				//统计和  比如：<aa,list(1,1) 结果是1+1=2 最后输出就是<aa,2>
				int sum =0;
				for(IntWritable value:values){
					sum+=value.get();
				}
				outPutVal.set(sum);
				
				//reduce输出
				content.write(key, outPutVal);
				
			}
		 
		}
		
	/**
	 * Driver
	 */
	// run 是 Tool中的方法
	public int run(String[] args) throws Exception {
		// 获取configuration 从继承的Configured类中获取
		Configuration cf = getConf();
		// 创建job
		try {
			// 配置文件 job名称
			Job job = Job.getInstance(cf, this.getClass().getSimpleName());
			// 设置运行类的类型
			job.setJarByClass(this.getClass());

			/**** input ******/
			// input map reduce output 串起来
			Path inPath = new Path(args[0]);
			FileInputFormat.addInputPath(job, inPath);
			/**** map ******/
			// map 方法类型嗯
			job.setMapperClass(WorldCountMapper.class);
			// map 输出key value 类型
			job.setMapOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			/**** reduce ******/
			// reduce类型
			job.setReducerClass(WorldCountReducer.class);
			// reduce 输出 也就是job输出
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			/**** output ******/
			Path outPath = new Path(args[1]);
			FileOutputFormat.setOutputPath(job, outPath);
			/**** 提交job ******/
			// 返回布尔类型 这里设置true是打印日志信息 设置false是不打印日志
			boolean isSucc = job.waitForCompletion(true);

			return isSucc ? 0 : 1;
		} catch (IOException e) {
			e.printStackTrace();
		} catch (ClassNotFoundException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} catch (InterruptedException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		return 1;
	}
		
		public static void main(String[] args) throws Exception {
			//运行
			//int status = new WordCountMapReduce().run(args);
			Configuration  conf = new Configuration();
			//在这个里边 设置了传递的参数conf  然后在run方法中获取 都是父类的方法
			int status=	ToolRunner.run(conf, new WordCountMapReduce(), args);
			
			System.exit(status);
			
		}

			
}

程序模板：带TODO的地方是开发时需要修改的地方

package com.my.hadoop.hadoophdfs.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 *    单词统计类
 * @author liming
 *
 */
public class ModuleMapReduce extends Configured implements Tool {
		/**
		 * TODO Map  开发时修改四个参数
		 * @author liming
		 *
		 */
		public static class ModuleMapper extends Mapper<LongWritable ,Text, Text, IntWritable>{
				public void map(LongWritable key, Text value, Context context)
					throws IOException, InterruptedException {
					//TODO 实现业务逻辑
			}
			
		}
		/**
		 * TODO Reduce  开发时修改四个参数
		 * @author liming
		 *
		 */
		public static class ModuleReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
			protected void reduce(Text key, Iterable<IntWritable> values,
				 	 Context content)
					throws IOException, InterruptedException {
				//TODO 业务逻辑
			}
		 
		}
		
	/**
	 * Driver
	 */
	// run 是 Tool中的方法
	public int run(String[] args) throws Exception {
		// 获取configuration 从继承的Configured类中获取
		Configuration cf = getConf();
		// 创建job
		try {
			// 配置文件 job名称
			Job job = Job.getInstance(cf, this.getClass().getSimpleName());
			// 设置运行类的类型
			job.setJarByClass(this.getClass());

			/**** input ******/
			// input map reduce output 串起来
			Path inPath = new Path(args[0]);
			FileInputFormat.addInputPath(job, inPath);
			/**** map ******/
			// map 方法类型嗯
			job.setMapperClass(ModuleMapper.class);
			// map 输出key value 类型
			job.setMapOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			/**** reduce ******/
			// reduce类型
			job.setReducerClass(ModuleReducer.class);

			// TODO reduce 输出 也就是job输出的类型 开发时需要修改
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			/**** output ******/
			Path outPath = new Path(args[1]);
			FileOutputFormat.setOutputPath(job, outPath);
			/**** 提交job ******/
			// 返回布尔类型 这里设置true是打印日志信息 设置false是不打印日志
			boolean isSucc = job.waitForCompletion(true);

			return isSucc ? 0 : 1;
		} catch (IOException e) {
			e.printStackTrace();
		} catch (ClassNotFoundException e) {
			e.printStackTrace();
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
		return 1;
	}
		
		public static void main(String[] args) throws Exception {
			//运行
			Configuration  conf = new Configuration();
			//在这个里边 设置了传递的参数conf  然后在run方法中获取 都是父类的方法
			int status=	ToolRunner.run(conf, new WordCountMapReduce(), args);
			System.exit(status);
			
		}

			
}

(12) Hadoop Java 实现MapReduce HelloWord 单词统计 更新版

程序模板：带TODO的地方是开发时需要修改的地方

猜你喜欢

(12) Hadoop Java 实现MapReduce HelloWord 单词统计更新版