Count how many times each word appears in each file, and list the per-file counts in descending order
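
For concreteness, here is a small worked example of what the job produces. The second file name, mapreduce-4-2.txt, is an assumed name for illustration: only mapreduce-4-1.txt appears in the code, and every other file name falls into the reducer's else branch.

    contents of mapreduce-4-1.txt:
    liangchaowei love liujialing
    liangchaowei love

    contents of mapreduce-4-2.txt:
    liangchaowei love liujialing

    expected output (file with the larger count listed first; on a tie the second file comes first):
    liangchaowei    mapreduce-4-1.txt:2    mapreduce-4-2.txt:1
    liujialing      mapreduce-4-2.txt:1    mapreduce-4-1.txt:1
    love            mapreduce-4-1.txt:2    mapreduce-4-2.txt:1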

package kaoshi3;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

//Count how many times each word appears in each file and list the counts in descending order

public class wordcount {
    static class MyMapper extends Mapper<LongWritable, Text, Text, Text>{
        Text mk=new Text();
        Text mv=new Text();
        String filename="";
        // setup() runs once per map task before map(), so it is the place
        // to look up which file this task's split belongs to
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            InputSplit insplit = context.getInputSplit();   // the input split this task is processing
            FileSplit fs = (FileSplit) insplit;             // cast: InputSplit itself has no method returning the file path
            filename = fs.getPath().getName();              // remember the file name for map()
        }
        @Override
        protected void map(LongWritable key, 
                Text value,
                Context context)
                throws IOException, InterruptedException {

            // sample input line: liangchaowei love liujialing
            String[] sp = value.toString().split(" ");
            for (String word : sp) {
                mk.set(word);
                mv.set(filename);
                context.write(mk, mv);          // emit the word as key and the file name as value
            }
        }
    }
    static class MyReducer extends Reducer<Text, Text, Text, Text>{
        Text t = new Text();
        @Override
        protected void reduce(Text key, 
                Iterable<Text> values, 
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // use local variables so the counts restart at zero for every key
            // (as instance fields they would keep accumulating across reduce() calls)
            int sum1 = 0;
            int sum2 = 0;
            String file1 = "";
            String file2 = "";
            for (Text v : values) {
                String sv = v.toString();
                if (sv.equals("mapreduce-4-1.txt")) {   // each value is exactly a file name
                    file1 = sv;                         // file name
                    sum1++;                             // occurrences in that file
                } else {                                // any other file lands here
                    file2 = sv;                         // file name
                    sum2++;                             // occurrences in that file
                }
            }
            // emit the file with the larger count first (descending order);
            // plain String concatenation is fine for output this small
            String out;
            if (sum1 > sum2) {
                out = file1 + ":" + sum1 + "\t" + file2 + ":" + sum2;
            } else {
                out = file2 + ":" + sum2 + "\t" + file1 + ":" + sum1;
            }
            t.set(out);
            context.write(key, t);
        }
    }
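
    // --- Alternative reducer (a sketch, not part of the original post) -------
    // MyReducer above only knows the two hard-coded file names. This variant
    // handles any number of input files: it tallies one count per (word, file)
    // record, then emits the files in descending order of count. java.util
    // names are fully qualified so the import list above can stay unchanged.
    static class GeneralReducer extends Reducer<Text, Text, Text, Text> {
        Text t = new Text();
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            java.util.Map<String, Integer> counts = new java.util.HashMap<>();
            for (Text v : values) {
                counts.merge(v.toString(), 1, Integer::sum);    // one record = one occurrence
            }
            StringBuilder sb = new StringBuilder();
            counts.entrySet().stream()
                  .sorted(java.util.Map.Entry.<String, Integer>comparingByValue().reversed())
                  .forEach(e -> sb.append(e.getKey()).append(':').append(e.getValue()).append('\t'));
            t.set(sb.toString().trim());
            context.write(key, t);  // enable with job.setReducerClass(GeneralReducer.class)
        }
    }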

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        //needed when running locally, so HDFS operations are performed as the hadoop user
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        //load the configuration
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);            //create a job

        job.setJarByClass(kaoshi3.wordcount.class); //driver class, used to locate the jar

        job.setMapperClass(MyMapper.class);         //set the mapper class
        job.setReducerClass(MyReducer.class);       //set the reducer class

        job.setOutputKeyClass(Text.class);          //output key type
        job.setOutputValueClass(Text.class);        //output value type

        FileInputFormat.addInputPath(job, new Path("hdfs://hadoop01:9000/ksin02"));     //input path

        FileSystem fs = FileSystem.get(new URI("hdfs://hadoop01:9000"), conf);

        Path path = new Path("/ksout02");                       //output path
        if(fs.exists(path)){                                    //MapReduce refuses to overwrite an existing output path, so delete it first
            fs.delete(path,true);
        }
        FileOutputFormat.setOutputPath(job, path);
        job.waitForCompletion(true);
    }

}
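
Assuming the project is packaged into a jar and the hadoop01 cluster from the hard-coded paths is reachable, the job can be launched with the standard hadoop jar command (the jar name here is hypothetical). Since the input and output paths are fixed in main(), no extra arguments are needed:

    hadoop jar kaoshi3.jar kaoshi3.wordcount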

Reposted from blog.csdn.net/YZY_001/article/details/82289780