Hadoop detailed notes (12): MapReduce data analysis case - movie rating case

Free video tutorial  https://www.51doit.com/  or contact the blogger on WeChat 17710299606

1 Data

{"movie":"994","rate":"3","timeStamp":"978242080","uid":"5"}
{"movie":"994","rate":"4","timeStamp":"978244540","uid":"3"}
{"movie":"994","rate":"1","timeStamp":"978246576","uid":"2"}
{"movie":"994","rate":"4","timeStamp":"978245568","uid":"2"}
{"movie":"272","rate":"3","timeStamp":"978245487","uid":"1"}
{"movie":"272","rate":"1","timeStamp":"97824}
{"movie":"348","rate":"3","timeStamp":"978241434","uid":"5"}
{"movie":"348","rate":"4","timeStamp":"978245863","uid":"3"}
{"movie":"348","rate":"5","timeStamp":"978241434","uid":"5"}
{"movie":"348","rate":"4","timeStamp":"978245863","uid":"3"}
{"movie":"348","rate":"1","timeStamp":"978241434","uid":"5"}
{"movie":"348","rate":"2","timeStamp":"978245863","uid":"3"}
{"movie":"348","rate":"2","timeStamp":"978245863","uid":"3"}

2 Requirements

  1.  Count the total score of each movie
  2.  Count the average score of each movie

Note that one record in the sample data is truncated; the mappers below wrap the JSON parsing in a try/catch, so such dirty lines are simply skipped. Each JSON record is first parsed into a MovieBean POJO:
package com._51doit.pojo;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/10
 * Description: encapsulates one movie rating record
 * {"movie":"994","rate":"4","timeStamp":"978245568","uid":"2"}
 */
public class MovieBean {
    /** movie id */
    private String movie;
    private double rate;
    private long timeStamp;
    private int uid;

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public double getRate() {
        return rate;
    }

    public void setRate(double rate) {
        this.rate = rate;
    }

    public long getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(long timeStamp) {
        this.timeStamp = timeStamp;
    }

    public int getUid() {
        return uid;
    }

    public void setUid(int uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return "MovieBean{" +
                "movie='" + movie + '\'' +
                ", rate=" + rate +
                ", timeStamp=" + timeStamp +
                ", uid=" + uid +
                '}';
    }
}
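As a quick sanity check (not part of the MR jobs), a single sample line can be parsed into the bean with fastjson's JSON.parseObject, which is the same call the mappers below use; the string-valued "rate", "timeStamp" and "uid" fields are converted to the bean's numeric types automatically. This is only a minimal sketch, and the class name MovieBeanParseDemo is made up for illustration:

package com._51doit.pojo;

import com.alibaba.fastjson.JSON;

public class MovieBeanParseDemo {
    public static void main(String[] args) {
        String line = "{\"movie\":\"994\",\"rate\":\"4\",\"timeStamp\":\"978245568\",\"uid\":\"2\"}";
        // fastjson matches JSON fields to the bean's setters by name
        MovieBean mb = JSON.parseObject(line, MovieBean.class);
        // should print something like: MovieBean{movie='994', rate=4.0, timeStamp=978245568, uid=2}
        System.out.println(mb);
    }
}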

3 Count the total score of each movie

package com._51doit.mr.movie;

import com._51doit.pojo.MovieBean;
import com.alibaba.fastjson.JSON;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/10
 * Description: movie total-score case
 */
public class MovieSumRate {
    static class MovieSumRateMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
        Text k = new Text();
        DoubleWritable v = new DoubleWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = null;
            try {
                // one line of input data
                line = value.toString();
                // convert the JSON line into a JavaBean
                MovieBean mb = JSON.parseObject(line, MovieBean.class);
                String movie = mb.getMovie();
                double rate = mb.getRate();
                k.set(movie);
                v.set(rate);
                context.write(k, v);
            } catch (Exception e) {
                // malformed line: print it and skip
                System.out.println(line);
            }
        }
    }

    static class MovieSumRateReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
        DoubleWritable v = new DoubleWritable() ;
        /**
         * Aggregate all ratings of the same movie
         *
         * @param key     the movie id
         * @param values  all ratings for that movie
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            double sum = 0d ;
            for (DoubleWritable value : values) {
                double rate = value.get();
                sum+=rate ;
            }
            v.set(sum);
            context.write(key,v);
        }
    }

    public static void main(String[] args) throws Exception {

        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // second argument: the job name
        Job job = Job.getInstance(conf, MovieSumRate.class.getSimpleName());

        job.setMapperClass(MovieSumRateMapper.class);
        job.setReducerClass(MovieSumRateReducer.class);
        // map-stage output types (optional here: they match the final output types)
        //job.setMapOutputKeyClass(Text.class);
        //job.setMapOutputValueClass(DoubleWritable.class);
        // final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // job.setNumReduceTasks(2);  // start 2 reduce tasks
        // input path and output path
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\movie\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\movie\\res2"));

        job.waitForCompletion(true);
    }

}
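For the sample data in section 1 (with the truncated record skipped by the mapper's try/catch), the total-score output in part-r-00000 should look roughly like this, one tab-separated movie/total pair per line:

272	3.0
348	21.0
994	12.0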

4 Count the average score of each movie

package com._51doit.mr.movie;

import com._51doit.pojo.MovieBean;
import com.alibaba.fastjson.JSON;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/10
 * Description: movie average-score case
 */
public class MovieAvgRate {

    static class MovieAvgRateMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
        Text k = new Text();
        DoubleWritable v = new DoubleWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                String line = value.toString();
                MovieBean mb = JSON.parseObject(line, MovieBean.class);
                double rate = mb.getRate();
                String movie = mb.getMovie();
                k.set(movie);
                v.set(rate);
                context.write(k,v);
            } catch (Exception e) {
                // malformed line: skip it
            }
        }
    }

    static class MovieAvgRateReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
        DoubleWritable v = new DoubleWritable();
        @Override
        protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            try {
                double sum = 0d;   // total score
                int count = 0;     // number of ratings
                for (DoubleWritable value : values) {
                    double rate = value.get();
                    sum += rate;
                    count++;
                }
                double avgRate = sum / count;
                v.set(avgRate);
                context.write(key, v);
            } catch (Exception e) {
                // should not happen; ignore
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // second argument: the job name
        Job job = Job.getInstance(conf, MovieAvgRate.class.getSimpleName());

        job.setMapperClass(MovieAvgRateMapper.class);
        job.setReducerClass(MovieAvgRateReducer.class);
        // map-stage output types (optional here: they match the final output types)
        //job.setMapOutputKeyClass(Text.class);
        //job.setMapOutputValueClass(DoubleWritable.class);
        // final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // job.setNumReduceTasks(2);  // start 2 reduce tasks
        // input path and output path
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\movie\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\movie\\res12"));

        job.waitForCompletion(true);

    }
}
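With the same sample data every movie happens to average exactly 3.0 (994: 12/4, 348: 21/7, 272: 3/1), so the expected output is roughly:

272	3.0
348	3.0
994	3.0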

5 Count each user's total rating score (key: uid, value: MovieBean)

If a custom class is used in an MR program (it has to travel through the shuffle), it must support Hadoop serialization.

As a value: it must be serializable (implement Writable).

As a key: it must be serializable and sortable (implement WritableComparable), as sketched below.
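As a hedged illustration of the key case only (the jobs below put MovieWritable on the value side, so they do not need this), a WritableComparable variant might look like this minimal sketch; the class name MovieKey is made up for illustration, and it sorts by rate descending:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Hypothetical key type: serializable AND sortable, so it could be used as a MapReduce key
public class MovieKey implements WritableComparable<MovieKey> {
    private String movie;
    private double rate;

    public void write(DataOutput out) throws IOException {
        out.writeUTF(movie);
        out.writeDouble(rate);
    }

    public void readFields(DataInput in) throws IOException {
        this.movie = in.readUTF();
        this.rate = in.readDouble();
    }

    // keys are sorted by this method during the shuffle: here, rate descending
    public int compareTo(MovieKey other) {
        return Double.compare(other.rate, this.rate);
    }
}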

5.1 MovieWritable

package com._51doit.pojo;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/10
 * Description: encapsulates one movie rating record.
 * This class goes on the value output by the MapTask, so it must be serializable.
 * Note: do not use the JDK's built-in serialization; it carries too much redundant data
 * (fully qualified class name, field types, ...).
 * *******  Use the serialization mechanism provided by Hadoop instead (implement Writable).
 */
public class MovieWritable implements Writable {
    /**
     * movie id
     */
    private String movie;
    private double rate;
    private long timeStamp;
    private int uid;

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public double getRate() {
        return rate;
    }

    public void setRate(double rate) {
        this.rate = rate;
    }

    public long getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(long timeStamp) {
        this.timeStamp = timeStamp;
    }

    public int getUid() {
        return uid;
    }

    public void setUid(int uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return "Movie{" +
                "movie='" + movie + '\'' +
                ", rate=" + rate +
                ", timeStamp=" + timeStamp +
                ", uid=" + uid +
                '}';
    }
    // Note: the order of writes must match the order of reads in readFields

    /**
     * Serialization: write the fields out
     *
     * @param dataOutput
     * @throws IOException
     */
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(movie);
        dataOutput.writeDouble(rate);
        dataOutput.writeLong(timeStamp);
        dataOutput.writeInt(uid);
    }

    /**
     * Deserialization: read the fields back in the same order
     *
     * @param dataInput
     * @throws IOException
     */
    public void readFields(DataInput dataInput) throws IOException {
        this.movie = dataInput.readUTF();
        this.rate = dataInput.readDouble();
        this.timeStamp = dataInput.readLong();
        this.uid = dataInput.readInt();

    }
}

5.2 MovieSumRateUid

package com._51doit.mr.movie;

import com._51doit.pojo.MovieWritable;
import com.alibaba.fastjson.JSON;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/10
 * Description: total rating score per user (uid)
 */
public class MovieSumRateUid {

    static class MovieSumRateUidMapper extends Mapper<LongWritable, Text, IntWritable, MovieWritable> {
        IntWritable k = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                String line = value.toString();
                MovieWritable mw = JSON.parseObject(line, MovieWritable.class);
                k.set(mw.getUid());
                context.write(k, mw);
            } catch (Exception e) {
                // malformed line: skip it
                // e.printStackTrace();
            }

        }
    }

    static class MovieSumRateUidReducer extends Reducer<IntWritable, MovieWritable, IntWritable, DoubleWritable> {
        DoubleWritable v = new DoubleWritable();

        @Override
        protected void reduce(IntWritable key, Iterable<MovieWritable> values, Context context) throws IOException, InterruptedException {
            double sum = 0d;
            for (MovieWritable mw : values) {
                sum += mw.getRate();
            }
            v.set(sum);
            context.write(key, v);
        }
    }

    public static void main(String[] args) throws Exception {

        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // second argument: the job name
        Job job = Job.getInstance(conf, MovieSumRateUid.class.getSimpleName());

        job.setMapperClass(MovieSumRateUidMapper.class);
        job.setReducerClass(MovieSumRateUidReducer.class);
        // map-stage output types (they differ from the final output types, so they must be set)
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(MovieWritable.class);
        // final output key/value types
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(DoubleWritable.class);
        // job.setNumReduceTasks(2);  // start 2 reduce tasks
        // input path and output path
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\movie\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\movie\\uid_sum_rate"));

        job.waitForCompletion(true);
    }


}
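Again based on the sample data (malformed record skipped), the per-user totals should come out roughly as:

1	3.0
2	5.0
3	16.0
5	12.0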


6 Count the top N highest-rated records of each movie

package com._51doit.mr.movie;

import com._51doit.pojo.MovieWritable;
import com.alibaba.fastjson.JSON;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

/**
 * Author:   多易教育-行哥
 * Date:     2020/7/12
 * Description: output the n highest-rated records of each movie
 * KEY: movieID
 * VALUE: MovieWritable
 * Approach:
 * 1    maptask:    read a line --> MovieWritable --> (movie, MovieWritable)
 * 2    reducetask: (movie, Iterator<MW>) --> copy the iterator's records into a list,
 *                  sort the list by rate descending, output the first n
 */
public class Movie_RataTopN {
    static class Movie_RataTopNMapper extends Mapper<LongWritable, Text, Text, MovieWritable> {
        Text k = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            try {
                String line = value.toString();
                MovieWritable mw = JSON.parseObject(line, MovieWritable.class);
                k.set(mw.getMovie());
                context.write(k, mw);
            } catch (Exception e) {
                // malformed line: skip it
            }
        }
    }

    static class Movie_RataTopNReducer extends Reducer<Text, MovieWritable, MovieWritable, NullWritable> {
        /**
         * How many objects does the values iterator really hold? Just one: Hadoop reuses
         * a single MovieWritable instance and refills it on every iteration.
         *
         * @param key
         * @param values
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void reduce(Text key, Iterable<MovieWritable> values, Context context) throws IOException, InterruptedException {
            try {
                List<MovieWritable> list = new ArrayList<MovieWritable>();
                // The iterator reuses a single object; adding it to the list directly would
                // leave the list holding n references to the same (last) record.
                for (MovieWritable mw : values) {
                    MovieWritable mb = new MovieWritable();
                  /*  mb.setMovie(mw.getMovie());
                    mb.setRate(mw.getRate());
                    mb.setTimeStamp(mw.getTimeStamp());
                    mb.setUid(mw.getUid());*/
                    // copy the current record's fields into a fresh object
                    BeanUtils.copyProperties(mb, mw);
                    list.add(mb); // add the copy, not the reused iterator object
                }
                // sort by rate, descending
                Collections.sort(list, new Comparator<MovieWritable>() {
                    public int compare(MovieWritable o1, MovieWritable o2) {
                        return Double.compare(o2.getRate(), o1.getRate());
                    }
                });
                // output the top 3
                for (int i = 0; i < Integer.min(3, list.size()); i++) {
                    context.write(list.get(i), NullWritable.get());
                }
            } catch (Exception e) {
                // should not happen; ignore
            }

        }
    }


    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // second argument: the job name
        Job job = Job.getInstance(conf, Movie_RataTopN.class.getSimpleName());

        job.setMapperClass(Movie_RataTopNMapper.class);
        job.setReducerClass(Movie_RataTopNReducer.class);
        // map-stage output types (they differ from the final output types, so they must be set)
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MovieWritable.class);
        // final output key/value types
        job.setOutputKeyClass(MovieWritable.class);
        job.setOutputValueClass(NullWritable.class);
        // job.setNumReduceTasks(2);  // start 2 reduce tasks
        // input path and output path
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\movie\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\movie\\top_res2"));

        job.waitForCompletion(true);

    }
}
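As a rough check against the sample data: movie 348 has ratings 3, 4, 5, 4, 1, 2, 2, so its top 3 records are the rate-5 record followed by the two rate-4 records; since the value is NullWritable, each output line is expected to be just the record's toString(), e.g.:

Movie{movie='348', rate=5.0, timeStamp=978241434, uid=5}
Movie{movie='348', rate=4.0, timeStamp=978245863, uid=3}
Movie{movie='348', rate=4.0, timeStamp=978245863, uid=3}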

Origin blog.csdn.net/qq_37933018/article/details/107252220