MapReduce - Word Count

Count the frequency of each word in a text file with a Hadoop MapReduce job.
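The original post does not show the input file. A hypothetical /test/input/t1.txt that is consistent with the output in step (5), given that the mapper strips one trailing punctuation character per word, could be:

Hello World! Hello I Love You, You are my Dream hope.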

package Test01;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class WordCount {
    public static void main(String[] args){
        try{
            // Get the configuration object
            Configuration conf = new Configuration();
            // (any extra conf settings would go here)
            // Get the job
            Job job = Job.getInstance(conf, "WordCount02");
            // Set the job's main class
            job.setJarByClass(WordCount.class);
            // Map-side settings
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            // Reduce-side settings
            job.setReducerClass(MyReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Set the job's input and output paths
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // Submit the job and block until it finishes
            int success = job.waitForCompletion(true) ? 0 : 1;
            // Exit with the job's status
            System.exit(success);
        }
        catch (Exception e){
            e.printStackTrace();
        }
    }
    // Custom Mapper class
    public static class MyMapper extends Mapper<Object, Text, Text, LongWritable> {
        public Text k = new Text();
        public LongWritable v = new LongWritable(1L);
        // map function: the core business logic of the map phase; called once per input line
        @Override
        protected void map(Object key, Text value, Context context){
            try{
                // Get the line as a string
                String row = value.toString();
                // Split the line on spaces
                String[] words = row.split(" ");
                for(String st: words){
                    // Skip empty tokens produced by consecutive spaces, which would
                    // otherwise make ct[st.length()-1] throw an exception
                    if (st.isEmpty()){
                        continue;
                    }
                    char[] ct = st.toCharArray();
                    char last = ct[st.length() - 1];
                    String st2;
                    if ((last >= 'a' && last <= 'z') || (last >= 'A' && last <= 'Z')){
                        // Last character is a letter: keep the whole word
                        st2 = String.valueOf(ct);
                    }
                    else {
                        // Otherwise drop the single trailing punctuation character
                        st2 = st.substring(0, st.length() - 1);
                    }
                    k.set(st2);
                    context.write(k, v);
                }
            }
            catch (Exception e){
                e.printStackTrace();
            }
        }
    }
    // Custom Reducer class
    public static class MyReducer extends Reducer<Text, LongWritable, Text, IntWritable> {
        public IntWritable v = new IntWritable();
        // reduce function: the core business logic of the reduce phase
        @Override
        protected void reduce(Text key, Iterable<LongWritable> value, Context context){
            try{
                // Counter for this key
                int cnt = 0;
                for(LongWritable i: value){
                    cnt += i.get();
                }
                v.set(cnt);
                // Emit the final (word, count) pair
                context.write(key, v);
            }
            catch(Exception e){
                e.printStackTrace();
            }
        }
    }
}
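One optional optimization, not in the original post: because the map output value type (LongWritable) differs from the job's final output value type (IntWritable), MyReducer cannot be reused as a combiner, since a combiner must consume and emit the map output types. A minimal combiner sketch under that constraint, added alongside MyMapper and MyReducer inside WordCount:

    // Hypothetical combiner: pre-sums counts on the map side before the shuffle
    public static class MyCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
        public LongWritable sum = new LongWritable();
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context){
            try{
                // Sum the partial counts for this key
                long cnt = 0;
                for(LongWritable i: values){
                    cnt += i.get();
                }
                sum.set(cnt);
                // Emit with the same types as the map output
                context.write(key, sum);
            }
            catch(Exception e){
                e.printStackTrace();
            }
        }
    }

To enable it, add job.setCombinerClass(MyCombiner.class); after the map-side settings in main.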

(1) Open the Maven panel on the right side of IDEA and click Lifecycle -> clean (right-click) -> Run Maven Build.

(2) Then click install (right-click) -> Run Maven Build.
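Equivalently, the build can be run from a terminal in the project root (assuming Maven is on the PATH):

mvn clean install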

(3) Send the jar under the target directory to the Hadoop server.
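For example, with scp (the destination directory is illustrative; the host matches the session in step (5)):

scp target/MapReducePractice-1.0-SNAPSHOT.jar hadoop@hadoop105:~/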

(4) Run the job with the following command:

yarn jar MapReducePractice-1.0-SNAPSHOT.jar Test01.WordCount /test/input/t1.txt /test/output/01
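Note that the input file must already be in HDFS, and the job will fail if the output directory exists. If needed, prepare both with standard HDFS shell commands:

hdfs dfs -mkdir -p /test/input
hdfs dfs -put t1.txt /test/input
hdfs dfs -rm -r /test/output/01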

(5) Check the output:

[hadoop@hadoop105 ~]$ hdfs dfs -cat /test/output/01/part-r-00000
Dream 1
Hello 2
I 1
Love 1
World 1
You 2
are 1
hope 1
my 1


Reposted from blog.csdn.net/BlessingXRY/article/details/100363213