hadoop入门级:wordcount字母统计及按字母次序排序

刚刚学习Hadoop不久，代码不够简练直接

思想：

利用hadoop自带的key排序功能，先让A任务统计出字母和字母次数，输出文档，然后依此文档作为B任务的输入，然后排序，将次数作为key字母作为value，实际上就是交换A任务输出文档的key和value。

package wordcount;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class mc {
	public static void main(String[] args) throws Exception{
		Configuration conf=new Configuration();
		String [] otherargs=new GenericOptionsParser(conf,args).getRemainingArgs();
		if(otherargs.length<2) {
			System.out.println("Usage:wordcount <in> [<in>...] <out>");
			System.exit(2);}
		//构建任务对象
			Job job=Job.getInstance(conf,"word count");
			job.setJarByClass(mc.class);
			job.setMapperClass(TokenizerMapper.class);
			job.setReducerClass(IntSumReducer.class);
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			for(int i=0;i<otherargs.length-1;i++) {
				//设置需要统计的文件输入路径
				FileInputFormat.addInputPath(job,new Path(otherargs[i]));}
			//老版本需要强转为jobConf
			FileOutputFormat.setOutputPath((JobConf)job.getConfiguration(),new Path(otherargs[(otherargs.length-1)]));
			boolean b = job.waitForCompletion(true);
			//判断A任务是否结束，如果结束则进行下一个任务
	        if(b){
	        	Configuration confs=new Configuration();
	            Job job2 = Job.getInstance(confs,"word count2");
	            job2.setJarByClass(mc.class);
	            job2.setMapperClass(TokenizerMapper2.class);
	            job2.setOutputKeyClass(IntWritable.class);
	            job2.setOutputValueClass(Text.class);
	            FileInputFormat.setInputPaths(job2,new Path("hdfs:/output/part-r-00000"));
	            FileOutputFormat.setOutputPath((JobConf)job2.getConfiguration(),new Path("hdfs:/output2"));
	            boolean b1 = job2.waitForCompletion(true);
	            //如果B任务完成，则删除A任务留下的output保证一个文件夹输出
	        	if(b1) {
		        	FileSystem fs=FileSystem.get(conf);
		        	Path path=new Path("hdfs:/output");
		        	boolean isok=fs.deleteOnExit(path);
		        	if(isok){
		        		System.exit(b1 ? 0 : 1);
		        	}}}else{
	            System.exit(1);
	        }}
	public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
	    private IntWritable result = new IntWritable();

	    public void reduce(Text key, Iterable<IntWritable> values,Reducer<Text,IntWritable,Text,IntWritable>.Context context) throws IOException,InterruptedException {
	        //统计单词总数量
	    	int sum = 0;
	        for(IntWritable val:values) {
	            sum += val.get();
	        }
	        this.result.set(sum);
	        context.write(key,this.result);
	    }}
	public static class TokenizerMapper2 extends Mapper<Object, Text, IntWritable, Text> {
	    IntWritable one = new IntWritable();
	    private Text word = new Text();

	    public void map(Object key, Text value, Context context) throws IOException,InterruptedException {
	        String string = new String(value.toString());
	        String[] stringsplit=string.split("\t");
	        // 以tab键划分
	        for(int i=0;i<stringsplit.length;i++) {
	        	// 由A任务输出的数据可知：偶数为数字 奇数为字母
	        	if(i%2==0) {
	        		word.set(stringsplit[i]);
	        		one.set(Integer.parseInt(stringsplit[i+1]));
	        		context.write(one, this.word);
	        	}}}}	
	public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
	    IntWritable one = new IntWritable(1);
	    private Text word = new Text();
	    public void map(Object key, Text value, Context context) throws IOException,InterruptedException {
	        String string = new String(value.toString());
	        //文本处理
		String str=string.replaceAll("[`éê￡0123456789~!@#$%^&*()+=|{}':;',\\[\\].<>/?~！@#￥%……&;*（）——+|{}【】‘；：”“’。，、？|-]", "");
	       //文本切割
		str = str.trim();
		//此处决定是按字母分割还是按单词分割
	    String[] stringsplit=str.split("");
	        //写入 <a,[1,1,1,……]>
	     for(int i=0;i<stringsplit.length;i++) {
	            word.set(stringsplit[i]);
	            context.write(this.word, one);
	        }}}}

菜鸟上路_lbz

发布了72 篇原创文章 · 获赞 206 · 访问量 4万+

私信关注

hadoop入门级:wordcount字母统计及按字母次序排序

思想：

猜你喜欢