MapReduce Programming Mini-Case 11: Data Skew Scenario
Data:
a a a a a a b b b a a a a a a a c c b c a a a c a b b c a a d d e e f f f g a a a b a b h h g j
Requirement:
Run a wordcount over this data.
However, a problem arises:
the word a appears far more often than any other word,
so the reduce task responsible for the word a gets a disproportionately heavy load (load imbalance, one overloaded worker).
Question: how can we process the data so that this data skew is alleviated across the whole job?
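To see why map-side pre-aggregation helps, assume (purely for illustration) that the input line above is split across 3 map tasks. The word a appears 23 times out of 48 words, so without any pre-aggregation all 23 (a, 1) pairs are shuffled to the single reduce task that owns the key a, while the other reducers sit nearly idle. With the combiner shown below, each map task first collapses its own a's into one (a, localCount) record, so that reducer receives only 3 records instead of 23. The skew in the raw data is still there, but the volume that reaches the overloaded reducer shrinks dramatically.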
Data skew scenario, part 1: solution code
WordcountCombiner class implementation
package cn.edu360.mr.wc;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// The combiner runs on the map side and pre-aggregates the (word, 1) pairs,
// so far fewer records for a hot key like "a" reach the reduce side.
public class WordcountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the partial counts for this word within the current map task
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}
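Because this combiner's aggregation logic is identical to WordcountReducer's, an equivalent setup (a simplification on my part, not what this case's code does) is to register the reducer class itself as the combiner:

    // Works here only because wordcount's sum is associative and commutative
    job.setCombinerClass(WordcountReducer.class);

Keep in mind that the framework is free to run the combiner zero, one, or several times per map task, so the final result must never depend on whether it ran.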
WordcountMapper class implementation
package cn.edu360.mr.wc;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/*
 * KEYIN: the type of the keys the map task reads; here the starting byte offset of a line, a Long.
 * VALUEIN: the type of the values the map task reads; here the content of a line, a String.
 *
 * KEYOUT: the key type of the kv pairs returned by the user-defined map method; in the wordcount logic, the word, a String.
 * VALUEOUT: the value type of the kv pairs returned by the user-defined map method; in the wordcount logic, an Integer.
 *
 * However: in MapReduce, the data produced by map must be shipped to reduce, which requires
 * serialization and deserialization. The JDK's native serialization mechanism produces fairly
 * bloated output, which would make data transfer inefficient while the job runs.
 * Hadoop therefore designed its own serialization mechanism, and any type transferred in
 * MapReduce must implement Hadoop's serialization interface.
 *
 * For the common JDK types Long, String, Integer, Float, etc., Hadoop provides wrapper types
 * that implement its serialization interface: LongWritable, Text, IntWritable, FloatWritable.
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the line into words and emit (word, 1) for each one
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
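A small optional refinement, sketched here with the same imports as the mapper above (it is not part of the original code): map() runs once per input line, so allocating a fresh Text and IntWritable for every word creates needless garbage on hot mappers. The Writable objects can be reused, because context.write() serializes their current contents into the output buffer immediately:

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused across calls; safe because context.write() copies (serializes)
    // the current contents right away.
    private final Text outKey = new Text();
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String word : value.toString().split(" ")) {
            outKey.set(word);
            context.write(outKey, ONE);
        }
    }
}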
WordcountReducer class implementation
package cn.edu360.mr.wc;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum all the (partial) counts received for this word
        int count = 0;
        Iterator<IntWritable> iterator = values.iterator();
        while (iterator.hasNext()) {
            IntWritable value = iterator.next();
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}
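Why does a single reducer receive every a in the first place? With the default HashPartitioner (org.apache.hadoop.mapreduce.lib.partition.HashPartitioner), all records sharing a key are routed to the same reduce task. Its partitioning rule is essentially the following line, which is exactly why one hot key saturates one of the three reducers:

    // Every occurrence of the same key maps to the same partition,
    // so all (a, ...) records land on one reduce task.
    int partition = (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;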
JobSubmitterWindowsLocal class implementation
package cn.edu360.mr.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitterWindowsLocal {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Use the local file system and run the job in local (in-process) mode
        // instead of submitting it to a cluster
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");

        Job job = Job.getInstance(conf);

        job.setJarByClass(JobSubmitterWindowsLocal.class);

        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Register the map-side local aggregation (combiner) class
        job.setCombinerClass(WordcountCombiner.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path("f:/mrdata/wordcount/input"));
        FileOutputFormat.setOutputPath(job, new Path("f:/mrdata/wordcount/output2"));

        job.setNumReduceTasks(3);

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
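To verify that the combiner really shrinks the shuffle, one can read the job counters after waitForCompletion(true) returns (a sketch added here, not present in the original listing; it needs the extra imports org.apache.hadoop.mapreduce.Counters and org.apache.hadoop.mapreduce.TaskCounter):

    // With the combiner enabled, combine output records should be
    // far below map output records.
    Counters counters = job.getCounters();
    long mapOut = counters.findCounter(TaskCounter.MAP_OUTPUT_RECORDS).getValue();
    long combineOut = counters.findCounter(TaskCounter.COMBINE_OUTPUT_RECORDS).getValue();
    System.out.println("map output records: " + mapOut
            + ", combine output records: " + combineOut);

With job.setNumReduceTasks(3), the local run writes three result files, part-r-00000 through part-r-00002; the one holding the key a still has the largest count, but far fewer records had to travel to it.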