MapReduce编程小案例.8th—替换默认的文本输入输出组件为sequence文件输入输出组件
实现代码:
同样分2步实现
package cn.edu360.mr.index.sequence;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Step one of inverted-index construction: counts word occurrences per file,
 * emitting ("word-filename", count) pairs. Output is written as a SequenceFile
 * so step two can consume the typed key/value pairs directly.
 */
public class IndexStepOne {

    /**
     * Emits ("word-filename", 1) for every word on every input line.
     * Input key is the byte offset of the line (unused), value is the line text.
     */
    public static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Reuse output Writables across map() calls — Hadoop serializes them
        // immediately on write(), so per-record allocation is wasted GC pressure.
        private final Text outKey = new Text();
        private final IntWritable ONE = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // The input split tells us which source file this line came from.
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String fileName = inputSplit.getPath().getName();

            String[] words = value.toString().split(" ");
            for (String w : words) {
                // Key combines word and file name so counts are per (word, file).
                outKey.set(w + "-" + fileName);
                context.write(outKey, ONE);
            }
        }
    }

    /**
     * Sums the counts for each "word-filename" key.
     * Also usable as a combiner: integer addition is associative and commutative.
     */
    public static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused output value — see mapper note on Writable reuse.
        private final IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Reducer<Text, IntWritable, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            result.set(count);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(IndexStepOne.class);
        job.setMapperClass(IndexStepOneMapper.class);
        job.setReducerClass(IndexStepOneReducer.class);
        // Safe as a combiner because summation is associative/commutative;
        // cuts the number of (key, 1) records crossing the shuffle.
        job.setCombinerClass(IndexStepOneReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // job.setOutputFormatClass(TextOutputFormat.class); // the default output format
        // SequenceFile preserves the (Text, IntWritable) types for step two.
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path("F:\\mrdata\\index\\input"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\index\\out1-sequence"));

        job.setNumReduceTasks(3);

        // Propagate job success/failure to the process exit code so scripts
        // and job chains can detect a failed run (original ignored the result).
        boolean success = job.waitForCompletion(true);
        System.exit(success ? 0 : 1);
    }
}
package cn.edu360.mr.index.sequence;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
// NOTE(review): this is the OLD (mapred) API class and is unused; if the
// commented-out setInputFormatClass(TextInputFormat.class) line below were
// restored, it needs org.apache.hadoop.mapreduce.lib.input.TextInputFormat.
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step two of inverted-index construction: reads the ("word-filename", count)
 * SequenceFile produced by step one and groups by word, producing lines like
 * "hello    a.txt-->4    b.txt-->2".
 */
public class IndexStepTwo {

    /**
     * Splits the "word-filename" key back apart and emits
     * (word, "filename-->count"). Input types (Text, IntWritable) must match
     * the SequenceFile key/value types written by step one.
     */
    public static class IndexStepTwoMapper extends Mapper<Text, IntWritable, Text, Text> {

        // Reused output Writables — Hadoop copies them on write().
        private final Text outKey = new Text();
        private final Text outValue = new Text();

        @Override
        protected void map(Text key, IntWritable value,
                Mapper<Text, IntWritable, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // NOTE(review): assumes neither the word nor the file name contains
            // "-"; a file such as "my-notes.txt" would be truncated here.
            // A fix would need a safer separator chosen in step one as well.
            String[] split = key.toString().split("-");
            outKey.set(split[0]);
            outValue.set(split[1] + "-->" + value);
            context.write(outKey, outValue);
        }
    }

    /**
     * Concatenates all "filename-->count" entries for one word into a single
     * tab-separated line, e.g. input group
     * (hello, [a.txt-->4, b.txt-->4, c.txt-->4]).
     */
    public static class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {

        private final Text outValue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // StringBuilder over StringBuffer: no threads share this builder,
            // so the unsynchronized variant is the faster, correct choice.
            StringBuilder sb = new StringBuilder();
            for (Text value : values) {
                sb.append(value.toString()).append("\t");
            }
            outValue.set(sb.toString());
            context.write(key, outValue);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(IndexStepTwo.class);
        job.setMapperClass(IndexStepTwoMapper.class);
        job.setReducerClass(IndexStepTwoReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // job.setInputFormatClass(TextInputFormat.class); // the default input format
        // SequenceFileInputFormat hands the mapper typed (Text, IntWritable)
        // records exactly as step one wrote them.
        job.setInputFormatClass(SequenceFileInputFormat.class);

        FileInputFormat.addInputPath(job, new Path("F:\\mrdata\\index\\out1-sequence"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\index\\out2-sequence"));

        // Propagate job success/failure to the process exit code so scripts
        // and job chains can detect a failed run (original ignored the result).
        boolean success = job.waitForCompletion(true);
        System.exit(success ? 0 : 1);
    }
}
可以通过 MapReduce编程小案例.4th—倒排索引创建 来比较两部分代码之间的不同。
主要区别在于 job.setOutputFormatClass() 和 job.setInputFormatClass() 两个方法的设置;同时要注意两步之间传输的键值类型必须一致:第二步 Mapper 的输入类型 (Text, IntWritable) 必须与第一步写入 SequenceFile 的输出键值类型完全对应。