WordCount: an introductory MapReduce program

MapReduce is a core component of the Hadoop ecosystem. As the name suggests, it consists of two phases, map and reduce, each with its own responsibility. The map function processes the input document: the data is read line by line, split into fields, and then filtered and cleaned according to the user's needs until the target data is obtained. The reduce function aggregates the data emitted by the map phase, for example by summing it, and finally writes the statistics to the target output file. The specific code is as follows.

WCMapper.java:

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// The four generic parameters: the first two (KEYIN, VALUEIN) are the types of the
// mapper's input key and value, the last two are the types of its output key and value.
// Both map and reduce exchange their input and output as key-value pairs.
// By default the framework hands the mapper one line of the input at a time:
// the key is the byte offset of the line within the file, the value is the line itself.
public class WCMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // The MapReduce framework calls this method once for every line of input.
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // The business logic goes here; the framework has already delivered the data
        // as a key-value pair: key is the offset of the line, value is its content.

        // Convert the line to a String.
        String line = value.toString();
        // Split the line into words.
        String[] words = StringUtils.split(line, " ");
        // Emit each word as a key-value pair, k: the word, v: 1.
        for (String word : words) {
            context.write(new Text(word), new LongWritable(1));
        }
    }
}

WCReducer.java:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WCReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    // After the map phase finishes, the framework caches all key-value pairs,
    // groups them by key, and calls reduce once for each group.
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long count = 0;
        // Traverse the list of values and accumulate the sum.
        for (LongWritable value : values) {
            count += value.get();
        }
        // Output one statistic (word, total count) to the result file.
        context.write(key, new LongWritable(count));
    }
}
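Before moving on to the driver, here is a minimal sketch of the same computation in plain Java, with no Hadoop dependencies, to make the data flow concrete: the mapper emits (word, 1) for every word in a line, and the reducer sums those ones per word. The class name and the sample line are made up for illustration.

import java.util.LinkedHashMap;
import java.util.Map;

// Illustration only: simulates what WCMapper and WCReducer compute for a single line.
public class WordCountSketch {
    public static void main(String[] args) {
        String line = "hello world hello";            // sample input line (made up)

        // Map step: produce (word, 1) for every word; the map below also does the
        // grouping and summing that the reduce step performs in the real job.
        Map<String, Long> counts = new LinkedHashMap<>();
        for (String word : line.split(" ")) {
            counts.merge(word, 1L, Long::sum);        // reduce step: add up the 1s per word
        }

        // Prints "hello	2" and "world	1", mirroring the shape of the job's final output.
        for (Map.Entry<String, Long> entry : counts.entrySet()) {
            System.out.println(entry.getKey() + "\t" + entry.getValue());
        }
    }
}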
WCRunner.java:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Describes a concrete job: which class contains the map logic, which class contains
// the reduce logic, the path of the data the job should process, and the path the
// output should be written to.
public class WCRunner {

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Tell the framework which jar contains the classes this job uses.
        job.setJarByClass(WCRunner.class);

        // The mapper and reducer used by this job.
        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReducer.class);

        // The key/value types of the reduce output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // The key/value types of the map output.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Where the input data lives and where the output should go.
        // The paths can be local (e.g. on Windows) or on HDFS, as long as they are reachable.
        FileInputFormat.setInputPaths(job, new Path("/home/hadoop/data-shixun/wordcount/srcdata.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/home/hadoop/data-shixun/wordcount/output"));

        // Submit the job to the cluster and wait for it to finish.
        job.waitForCompletion(true);
    }
}

Each part of the code carries corresponding detailed comments.
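Once the three classes are packaged into a jar and submitted (for example with the hadoop jar command), the job writes its result to the output directory configured above; with the default text output format each reducer produces a file named part-r-00000 holding one word and its count per line, separated by a tab. The following is only a sketch, assuming the default output file name and that the job above has already completed successfully (the reader class name is hypothetical); it reads the result back through the HDFS FileSystem API.

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Sketch: print the word counts produced by the job above.
// Assumes the default reducer output file name part-r-00000.
public class WCResultReader {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path result = new Path("/home/hadoop/data-shixun/wordcount/output/part-r-00000");
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(result)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);   // each line is "word<TAB>count"
            }
        }
    }
}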
