MapReduce Mini-Case No. 8: Replacing the Default Text Input/Output Components with the SequenceFile Input/Output Components

Implementation code:

As in case 4, this is implemented in two steps: step one emits its <word-filename, count> pairs as a SequenceFile, and step two reads that SequenceFile back as its input.

package cn.edu360.mr.index.sequence;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;



public class IndexStepOne {

	public static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

		// For each word, emit <word-filename, 1>, e.g. <hello-a.txt, 1>
		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			
			// From the input split, get the file that the current line comes from
			FileSplit inputSplit = (FileSplit)context.getInputSplit();
			String fileName = inputSplit.getPath().getName();
			
			String[] words = value.toString().split(" ");
			for (String w : words) {
				
				// Emit "word-filename" as the key and 1 as the value
				context.write(new Text(w + "-" + fileName), new IntWritable(1));
			}
		}
		
	}
	
	public static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {

			int count = 0;
			for (IntWritable value : values) {
				count += value.get();
			}
			context.write(key, new IntWritable(count));
		}
	}
	
	
	
	public static void main(String[] args) throws Exception {
		
		Configuration conf = new Configuration();
		
		Job job = Job.getInstance(conf);
		
		job.setJarByClass(IndexStepOne.class);
		
		job.setMapperClass(IndexStepOneMapper.class);
		job.setReducerClass(IndexStepOneReducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		//job.setOutputFormatClass(TextOutputFormat.class); // this is the default output format
		job.setOutputFormatClass(SequenceFileOutputFormat.class);
		
		FileInputFormat.addInputPath(job, new Path("F:\\mrdata\\index\\input"));
		FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\index\\out1-sequence"));
		
		job.setNumReduceTasks(3);
		job.waitForCompletion(true);
	}
}
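
Since SequenceFileOutputFormat writes a binary container rather than plain text, the part files under out1-sequence cannot be checked directly in a text editor. The following small reader is a minimal sketch for verifying them; it is not part of the original post, and the class name SequenceFileDump as well as the part-file name part-r-00000 are assumptions:

package cn.edu360.mr.index.sequence;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

// Hypothetical helper for inspecting one output file of IndexStepOne
public class SequenceFileDump {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// One of the three part files written by the job (it runs 3 reduce tasks)
		Path path = new Path("F:\\mrdata\\index\\out1-sequence\\part-r-00000");

		try (SequenceFile.Reader reader =
				new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
			Text key = new Text();                  // key type written by step one
			IntWritable value = new IntWritable();  // value type written by step one
			while (reader.next(key, value)) {
				System.out.println(key + "\t" + value);
			}
		}
	}
}

The second step, IndexStepTwo, then reads the whole out1-sequence directory as its input:
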
package cn.edu360.mr.index.sequence;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;



public class IndexStepTwo {

	public static class IndexStepTwoMapper extends Mapper<Text, IntWritable, Text, Text> {

		// Input records look like <hello-a.txt, 4>; emit <hello, a.txt-->4>
		@Override
		protected void map(Text key, IntWritable value, Mapper<Text, IntWritable, Text, Text>.Context context)
				throws IOException, InterruptedException {
			
			// The key is "word-filename"; split it back into word and filename
			String[] split = key.toString().split("-");
			context.write(new Text(split[0]), new Text(split[1] + "-->" + value));
		}
		
	}
	
	public static class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text>{
		
		// One input group looks like: <hello, a.txt-->4> <hello, b.txt-->4> <hello, c.txt-->4>
		@Override
		protected void reduce(Text key, Iterable<Text> values,
				Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
			// StringBuffer is thread-safe and StringBuilder is not; when thread
			// safety is not a concern, StringBuilder is the faster choice
			StringBuilder sb = new StringBuilder();

			for (Text value : values) {
				sb.append(value.toString()).append("\t");
			}
			context.write(key, new Text(sb.toString()));
		}
	}
	
	
	
	public static void main(String[] args) throws Exception {
		
		Configuration conf = new Configuration();
		
		Job job = Job.getInstance(conf);
		
		job.setJarByClass(IndexStepTwo.class);
		
		job.setMapperClass(IndexStepTwoMapper.class);
		job.setReducerClass(IndexStepTwoReducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		
		//job.setInputFormatClass(TextInputFormat.class); // this is the default input format
		job.setInputFormatClass(SequenceFileInputFormat.class);

		FileInputFormat.addInputPath(job, new Path("F:\\mrdata\\index\\out1-sequence"));
		FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\index\\out2-sequence"));

		job.waitForCompletion(true);
	}
}

You can compare this code against MapReduce Mini-Case No. 4 (creating an inverted index) to see exactly what changed between the two versions.

The main changes are the job.setOutputFormatClass() and job.setInputFormatClass() calls; you also have to make sure the key/value types written by step one are exactly the types that step two's mapper declares as its input.
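
To make that coupling explicit, the following condensed sketch (a hypothetical FormatRecap class, for illustration only; it configures but does not submit any job) puts the relevant settings of both jobs side by side:

package cn.edu360.mr.index.sequence;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

// Illustration only: the lines that tie the two jobs together
public class FormatRecap {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();

		// Step one declares <Text, IntWritable> as its reducer output types,
		// and SequenceFileOutputFormat stores exactly those two types.
		Job stepOne = Job.getInstance(conf);
		stepOne.setOutputKeyClass(Text.class);
		stepOne.setOutputValueClass(IntWritable.class);
		stepOne.setOutputFormatClass(SequenceFileOutputFormat.class);

		// Step two reads the stored records back as-is, so its mapper must be
		// declared Mapper<Text, IntWritable, ...> rather than the
		// Mapper<LongWritable, Text, ...> that TextInputFormat would require.
		Job stepTwo = Job.getInstance(conf);
		stepTwo.setInputFormatClass(SequenceFileInputFormat.class);
	}
}

If the input format were left at the default, step two's mapper would receive <LongWritable, Text> pairs and fail with a ClassCastException at runtime.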



Reposted from blog.csdn.net/robertdowneylm/article/details/80327861