1 wordcount
2 倒排序
3 自定义分区(不同规则输出到不同的文件)
4 自定义文件输出
5 统计文件流
1 自定义输出类
package com.wzt.mapreduce.custom;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Custom output format that routes each (word, lineOffset) record into one of
 * two fixed HDFS files based on the parity of the offset value.
 *
 * <p>NOTE(review): the original declared bogus type parameters literally named
 * {@code <Text, LongWritable>}, which shadowed the Hadoop writable classes and
 * made the generics effectively {@code <Object, Object>}. The class is now
 * bound to the real {@link Text} / {@link LongWritable} types, which also
 * allows {@link LongWritable#get()} instead of parsing {@code toString()}.
 */
public class MyCustomOutputFormat extends FileOutputFormat<Text, LongWritable> {

    @Override
    public RecordWriter<Text, LongWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        FileSystem hdfs = FileSystem.get(conf);
        // Hard-coded destination paths: every task attempt writes to the same
        // two files, so this is only safe for a job with a single writer task.
        FSDataOutputStream os1 = hdfs.create(new Path("/wc/output1/file1.log"));
        FSDataOutputStream os2 = hdfs.create(new Path("/wc/output2/file2.log"));
        return new MyRecordWriter(os1, os2);
    }

    /** Writes even-offset records to {@code os1}, odd-offset records to {@code os2}. */
    public static class MyRecordWriter extends RecordWriter<Text, LongWritable> {

        private final FSDataOutputStream os1;
        private final FSDataOutputStream os2;

        /**
         * @param os1 stream for records whose value (line offset) is even
         * @param os2 stream for records whose value (line offset) is odd
         */
        public MyRecordWriter(FSDataOutputStream os1, FSDataOutputStream os2) {
            this.os1 = os1;
            this.os2 = os2;
        }

        @Override
        public void write(Text key, LongWritable value)
                throws IOException, InterruptedException {
            // value carries the byte offset of the source line (see CIOMapper).
            long offset = value.get();
            // NOTE(review): no separator or newline is written between keys, so
            // consecutive words are concatenated in the output files — confirm
            // this is the intended format.
            if (offset % 2 == 0) {
                os1.writeBytes(key.toString());
            } else {
                os2.writeBytes(key.toString());
            }
        }

        @Override
        public void close(TaskAttemptContext context)
                throws IOException, InterruptedException {
            // Close both streams defensively; either may be null in theory,
            // and each close is attempted even if the other stream is absent.
            if (os1 != null) {
                os1.close();
            }
            if (os2 != null) {
                os2.close();
            }
        }
    }
}
2 Mapper 数据整理类
package com.wzt.mapreduce.custom;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Splits each input line on spaces and emits one (word, lineOffset) pair per
 * word. The emitted value is the input key — the byte offset of the line —
 * which the custom output format later uses for parity-based file routing.
 */
public class CIOMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // Reused output key to avoid allocating a new Text per word; safe because
    // context.write serializes the writable before the next set() call.
    private final Text outKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // StringUtils.split skips empty tokens, so runs of spaces are harmless.
        String[] words = StringUtils.split(line, " ");
        for (String word : words) {
            outKey.set(word);
            context.write(outKey, key);
        }
    }
}
3 运行的主类(Map中数据直接输出所以没有使用到reducer)
package com.wzt.mapreduce.custom;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the custom-output-format job: CIOMapper feeds (word, offset)
 * pairs to MyCustomOutputFormat, which writes them to two fixed HDFS files.
 */
public class CIORunner {

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(CIORunner.class);
        job.setMapperClass(CIOMapper.class);
        // No reducer class is set. NOTE(review): with the reducer class unset
        // (and the reduce-task count untouched), Hadoop still runs the default
        // identity Reducer, so records pass through the shuffle unchanged
        // before reaching the output format.

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Route all output records through the custom two-file output format.
        job.setOutputFormatClass(MyCustomOutputFormat.class);

        FileInputFormat.setInputPaths(job, "/wc/input/xiyou.txt");
        // Still required by FileOutputFormat even though MyCustomOutputFormat
        // writes to its own hard-coded paths (the _SUCCESS marker lands here).
        FileOutputFormat.setOutputPath(job, new Path("/wc/outputcount"));

        // Local-filesystem variant kept for reference:
        // FileInputFormat.setInputPaths(job, "D:\\wordcount\\wordcount.txt");
        // FileOutputFormat.setOutputPath(job, new Path("D:\\wordcount\\output"));

        // Propagate job success/failure as the process exit code; the original
        // ignored the boolean returned by waitForCompletion.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}