编写自己WordCount程序
package rock.lee.wordcount; import java.io.IOException; import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class MyWordCount { /** * @author Rock Lee * * @Description * LongWritable,输入 * key类型 Text, * 输入value类型 * Text, 输出key类型 * IntWritable,输出vlaue类型 */ static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> { private static final IntWritable ONE = new IntWritable(1); @Override protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException { //读取每行的数据 String lineValue = value.toString(); //对每行数据进行分割\r\n\t StringTokenizer stzer = new StringTokenizer(lineValue); Text text = new Text(); while (stzer.hasMoreTokens()) { //获取分割后的每个值 String val = stzer.nextToken(); //key值 text.set(val); //key-->value context.write(text, ONE); } } } /** * * @author Rock Lee * * @Description */ static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> { @Override protected void reduce(Text key, Iterable<IntWritable> values,Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable val : values) { sum+= val.get(); } context.write(key, new IntWritable(sum)); } } public static void main(String[] args) throws Exception { //获取配置信息 Configuration configuration = new Configuration(); //创建任务,设置名称 Job job = new Job(configuration,"WC"); //设置任务运行类 job.setJarByClass(MyWordCount.class); //设置Mapper和Reducer类 job.setMapperClass(MyMapper.class); job.setReducerClass(MyReduce.class); //设置输入/输出路径 FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); //设置输出结果key/value类型 job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); //提交任务,等待运行结果,并在客户端显示运行信息 boolean success = job.waitForCompletion(true); System.exit(success?0:1); } }
将程序打包成wc.jar上传到Linux上,创建文件one、two
[root@centos data]# more one hello word bye word [root@centos data]# more two hello word bye hadoop
把文件上传到HDFS上,目录为/opt/wc/input
[root@centos data]# hadoop fs -lsr / -rw-r--r-- 1 root supergroup 21 2015-06-11 04:08 /opt/wc/input/one -rw-r--r-- 1 root supergroup 23 2015-06-11 04:08 /opt/wc/input/two
运行wc.jar
[root@centos data]# hadoop jar wc.jar /opt/wc/input/ /opt/wc/output Warning: $HADOOP_HOME is deprecated. 15/06/11 04:29:10 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same. 15/06/11 04:29:10 INFO input.FileInputFormat: Total input paths to process : 2 15/06/11 04:29:10 INFO util.NativeCodeLoader: Loaded the native-hadoop library 15/06/11 04:29:10 WARN snappy.LoadSnappy: Snappy native library not loaded 15/06/11 04:29:10 INFO mapred.JobClient: Running job: job_201506110402_0006 15/06/11 04:29:11 INFO mapred.JobClient: map 0% reduce 0% 15/06/11 04:29:32 INFO mapred.JobClient: map 50% reduce 0% 15/06/11 04:29:42 INFO mapred.JobClient: map 100% reduce 0% 15/06/11 04:30:05 INFO mapred.JobClient: map 100% reduce 100% 15/06/11 04:30:05 INFO mapred.JobClient: Job complete: job_201506110402_0006 15/06/11 04:30:05 INFO mapred.JobClient: Counters: 29 15/06/11 04:30:05 INFO mapred.JobClient: Job Counters 15/06/11 04:30:05 INFO mapred.JobClient: Launched reduce tasks=1 15/06/11 04:30:05 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=40074 15/06/11 04:30:05 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0 15/06/11 04:30:05 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0 15/06/11 04:30:05 INFO mapred.JobClient: Launched map tasks=2 15/06/11 04:30:05 INFO mapred.JobClient: Data-local map tasks=2 15/06/11 04:30:05 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=21707 15/06/11 04:30:05 INFO mapred.JobClient: File Output Format Counters 15/06/11 04:30:05 INFO mapred.JobClient: Bytes Written=30 15/06/11 04:30:05 INFO mapred.JobClient: FileSystemCounters 15/06/11 04:30:05 INFO mapred.JobClient: FILE_BYTES_READ=96 15/06/11 04:30:05 INFO mapred.JobClient: HDFS_BYTES_READ=260 15/06/11 04:30:05 INFO mapred.JobClient: FILE_BYTES_WRITTEN=160215 15/06/11 04:30:05 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=30 15/06/11 04:30:05 INFO mapred.JobClient: File Input Format Counters 15/06/11 04:30:05 INFO mapred.JobClient: Bytes Read=44 15/06/11 04:30:05 INFO mapred.JobClient: Map-Reduce Framework 15/06/11 04:30:05 INFO mapred.JobClient: Map output materialized bytes=102 15/06/11 04:30:05 INFO mapred.JobClient: Map input records=4 15/06/11 04:30:05 INFO mapred.JobClient: Reduce shuffle bytes=102 15/06/11 04:30:05 INFO mapred.JobClient: Spilled Records=16 15/06/11 04:30:05 INFO mapred.JobClient: Map output bytes=74 15/06/11 04:30:05 INFO mapred.JobClient: CPU time spent (ms)=820 15/06/11 04:30:05 INFO mapred.JobClient: Total committed heap usage (bytes)=413466624 15/06/11 04:30:05 INFO mapred.JobClient: Combine input records=0 15/06/11 04:30:05 INFO mapred.JobClient: SPLIT_RAW_BYTES=216 15/06/11 04:30:05 INFO mapred.JobClient: Reduce input records=8 15/06/11 04:30:05 INFO mapred.JobClient: Reduce input groups=4 15/06/11 04:30:05 INFO mapred.JobClient: Combine output records=0 15/06/11 04:30:05 INFO mapred.JobClient: Physical memory (bytes) snapshot=313032704 15/06/11 04:30:05 INFO mapred.JobClient: Reduce output records=4 15/06/11 04:30:05 INFO mapred.JobClient: Virtual memory (bytes) snapshot=1127878656 15/06/11 04:30:05 INFO mapred.JobClient: Map output records=8
/opt/wc/output目录不能存在,如果存在会有异常
Exception in thread "main" org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory /opt/wc/output already exists
查看结果
[root@centos data]# hadoop fs -text /opt/wc/output/part-r-00000 Warning: $HADOOP_HOME is deprecated. bye 2 hadoop 1 hello 2 word 3
通过web方式,访问50075端口查看,这种方式不能直接通过IP访问,只能通过Linux主机名称,所以要在windows的hosts文件配置主机名称到IP地址的映射
WorldCount程序流程详解