package rock.lee.wordcount;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyWordCount {

    /**
     * @author Rock Lee
     *
     * @Description
     * LongWritable: input key type
     * Text:         input value type
     * Text:         output key type
     * IntWritable:  output value type
     */
    static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Read one line of input
            String lineValue = value.toString();
            // Split the line on whitespace (space, \t, \n, \r, \f by default)
            StringTokenizer stzer = new StringTokenizer(lineValue);
            Text text = new Text();
            while (stzer.hasMoreTokens()) {
                // Each token becomes an output key
                String val = stzer.nextToken();
                text.set(val);
                // Emit <word, 1>
                context.write(text, ONE);
            }
        }
    }

    /**
     * @author Rock Lee
     *
     * @Description Sums the 1s emitted for each word.
     */
    static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        // Load the cluster configuration
        Configuration configuration = new Configuration();
        // Create the job and give it a name
        Job job = new Job(configuration, "WC");
        // Set the jar that carries the job classes
        job.setJarByClass(MyWordCount.class);
        // Set the Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);
        // Set the input/output paths
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Set the output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Submit the job, wait for it to finish, and print progress on the client
        boolean success = job.waitForCompletion(true);
        System.exit(success ? 0 : 1);
    }
}
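One detail worth noting in the driver: the job sets no combiner, which is why the run below reports Combine input records=0. Because word counting reduces by plain integer addition, which is associative and commutative, the reducer class can safely double as a combiner and pre-sum each map task's output before the shuffle. A minimal sketch, assuming the only change is one extra line in main():

    // In main(), alongside the existing mapper/reducer setup:
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReduce.class); // added: pre-sum counts locally on each map task
    job.setReducerClass(MyReduce.class);

With a combiner in place, the Combine input/output record counters become non-zero, and Reduce shuffle bytes shrinks whenever the input contains repeated words.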
Run wc.jar against the HDFS input directory (the log below shows two input files being processed):
[root@centos data]# hadoop jar wc.jar /opt/wc/input/ /opt/wc/output
Warning: $HADOOP_HOME is deprecated.

15/06/11 04:29:10 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
15/06/11 04:29:10 INFO input.FileInputFormat: Total input paths to process : 2
15/06/11 04:29:10 INFO util.NativeCodeLoader: Loaded the native-hadoop library
15/06/11 04:29:10 WARN snappy.LoadSnappy: Snappy native library not loaded
15/06/11 04:29:10 INFO mapred.JobClient: Running job: job_201506110402_0006
15/06/11 04:29:11 INFO mapred.JobClient:  map 0% reduce 0%
15/06/11 04:29:32 INFO mapred.JobClient:  map 50% reduce 0%
15/06/11 04:29:42 INFO mapred.JobClient:  map 100% reduce 0%
15/06/11 04:30:05 INFO mapred.JobClient:  map 100% reduce 100%
15/06/11 04:30:05 INFO mapred.JobClient: Job complete: job_201506110402_0006
15/06/11 04:30:05 INFO mapred.JobClient: Counters: 29
15/06/11 04:30:05 INFO mapred.JobClient:   Job Counters
15/06/11 04:30:05 INFO mapred.JobClient:     Launched reduce tasks=1
15/06/11 04:30:05 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=40074
15/06/11 04:30:05 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
15/06/11 04:30:05 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
15/06/11 04:30:05 INFO mapred.JobClient:     Launched map tasks=2
15/06/11 04:30:05 INFO mapred.JobClient:     Data-local map tasks=2
15/06/11 04:30:05 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=21707
15/06/11 04:30:05 INFO mapred.JobClient:   File Output Format Counters
15/06/11 04:30:05 INFO mapred.JobClient:     Bytes Written=30
15/06/11 04:30:05 INFO mapred.JobClient:   FileSystemCounters
15/06/11 04:30:05 INFO mapred.JobClient:     FILE_BYTES_READ=96
15/06/11 04:30:05 INFO mapred.JobClient:     HDFS_BYTES_READ=260
15/06/11 04:30:05 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=160215
15/06/11 04:30:05 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=30
15/06/11 04:30:05 INFO mapred.JobClient:   File Input Format Counters
15/06/11 04:30:05 INFO mapred.JobClient:     Bytes Read=44
15/06/11 04:30:05 INFO mapred.JobClient:   Map-Reduce Framework
15/06/11 04:30:05 INFO mapred.JobClient:     Map output materialized bytes=102
15/06/11 04:30:05 INFO mapred.JobClient:     Map input records=4
15/06/11 04:30:05 INFO mapred.JobClient:     Reduce shuffle bytes=102
15/06/11 04:30:05 INFO mapred.JobClient:     Spilled Records=16
15/06/11 04:30:05 INFO mapred.JobClient:     Map output bytes=74
15/06/11 04:30:05 INFO mapred.JobClient:     CPU time spent (ms)=820
15/06/11 04:30:05 INFO mapred.JobClient:     Total committed heap usage (bytes)=413466624
15/06/11 04:30:05 INFO mapred.JobClient:     Combine input records=0
15/06/11 04:30:05 INFO mapred.JobClient:     SPLIT_RAW_BYTES=216
15/06/11 04:30:05 INFO mapred.JobClient:     Reduce input records=8
15/06/11 04:30:05 INFO mapred.JobClient:     Reduce input groups=4
15/06/11 04:30:05 INFO mapred.JobClient:     Combine output records=0
15/06/11 04:30:05 INFO mapred.JobClient:     Physical memory (bytes) snapshot=313032704
15/06/11 04:30:05 INFO mapred.JobClient:     Reduce output records=4
15/06/11 04:30:05 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=1127878656
15/06/11 04:30:05 INFO mapred.JobClient:     Map output records=8
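Two things are worth reading out of this log. First, Map output records=8 with Reduce input groups=4 means the two input files held eight words in total, four of them distinct; the four <word, count> pairs land under /opt/wc/output (typically in part-r-00000) and can be printed with hadoop fs -cat /opt/wc/output/part-r-00000. Second, the WARN line at the top asks for arguments to be parsed with GenericOptionsParser by implementing the Tool interface. A hedged sketch of that refactoring follows; the driver class name is illustrative (the original post keeps everything in MyWordCount.main()):

package rock.lee.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Illustrative driver: same job setup as MyWordCount.main(), but wrapped in
// Tool/ToolRunner so GenericOptionsParser consumes generic Hadoop options.
public class MyWordCountDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // getConf() returns the Configuration populated by ToolRunner
        Job job = new Job(getConf(), "WC");
        job.setJarByClass(MyWordCount.class);
        job.setMapperClass(MyWordCount.MyMapper.class);
        job.setReducerClass(MyWordCount.MyReduce.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner strips generic options before args reaches run()
        System.exit(ToolRunner.run(new Configuration(), new MyWordCountDriver(), args));
    }
}

Invoked this way, generic options such as -D key=value placed before the input/output paths are consumed by ToolRunner instead of arriving in run() as positional arguments, which is exactly what the warning is asking for.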
Reposted from: http://mvplee.iteye.com/blog/2218989