Free video tutorials are available at https://www.51doit.com/, or contact the author on WeChat: 17710299606
1. Sample data
{"movie":"994","rate":"3","timeStamp":"978242080","uid":"5"}
{"movie":"994","rate":"4","timeStamp":"978244540","uid":"3"}
{"movie":"994","rate":"1","timeStamp":"978246576","uid":"2"}
{"movie":"994","rate":"4","timeStamp":"978245568","uid":"2"}
{"movie":"272","rate":"3","timeStamp":"978245487","uid":"1"}
{"movie":"272","rate":"1","timeStamp":"97824}
{"movie":"348","rate":"3","timeStamp":"978241434","uid":"5"}
{"movie":"348","rate":"4","timeStamp":"978245863","uid":"3"}
{"movie":"348","rate":"5","timeStamp":"978241434","uid":"5"}
{"movie":"348","rate":"4","timeStamp":"978245863","uid":"3"}
{"movie":"348","rate":"1","timeStamp":"978241434","uid":"5"}
{"movie":"348","rate":"2","timeStamp":"978245863","uid":"3"}
{"movie":"348","rate":"2","timeStamp":"978245863","uid":"3"}
2. Requirements
- Count the total score of each movie
- Count the average score of each movie
package com._51doit.pojo;
/**
* Author: 多易教育-行哥
* Date: 2020/7/10
* Description: Encapsulates one movie review record, e.g.
* {"movie":"994","rate":"4","timeStamp":"978245568","uid":"2"}
*/
public class MovieBean {

    /** Movie id. */
    private String movie;
    /** Rating score given by the user. */
    private double rate;
    /** Rating timestamp. */
    private long timeStamp;
    /** Reviewer user id. */
    private int uid;

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public double getRate() {
        return rate;
    }

    public void setRate(double rate) {
        this.rate = rate;
    }

    public long getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(long timeStamp) {
        this.timeStamp = timeStamp;
    }

    public int getUid() {
        return uid;
    }

    public void setUid(int uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        // Built with StringBuilder; the rendered text is identical to the
        // previous concatenation-based implementation.
        StringBuilder sb = new StringBuilder("MovieBean{");
        sb.append("movie='").append(movie).append('\'');
        sb.append(", rate=").append(rate);
        sb.append(", timeStamp=").append(timeStamp);
        sb.append(", uid=").append(uid);
        sb.append('}');
        return sb.toString();
    }
}
3. Count the total score of each movie
package com._51doit.mr.movie;
import com._51doit.mr.line.LineDemo;
import com._51doit.pojo.MovieBean;
import com.alibaba.fastjson.JSON;
import javafx.scene.shape.HLineTo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import java.io.IOException;
/**
* Author: 多易教育-行哥
* Date: 2020/7/10
* Description: Movie total-score example
*/
/**
 * MapReduce job: total rating score per movie.
 * Map:    (offset, json line)   -> (movieId, rate)
 * Reduce: (movieId, [rate ...]) -> (movieId, sum(rate))
 */
public class MovieSumRate {

    /** Parses one JSON rating record per input line and emits (movieId, rate). */
    static class MovieSumRateMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
        // Reused output key/value holders to avoid a per-record allocation.
        private final Text k = new Text();
        private final DoubleWritable v = new DoubleWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = null;
            try {
                line = value.toString();
                // Deserialize the JSON line into a MovieBean.
                MovieBean mb = JSON.parseObject(line, MovieBean.class);
                k.set(mb.getMovie());
                v.set(mb.getRate());
                context.write(k, v);
            } catch (Exception e) {
                // Dirty records (e.g. truncated JSON) are reported and skipped so
                // a few bad lines do not fail the whole task. Diagnostics go to
                // stderr, not stdout.
                System.err.println("Skipping malformed record: " + line);
            }
        }
    }

    /** Aggregates all ratings of the same movie into a single total score. */
    static class MovieSumRateReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
        private final DoubleWritable v = new DoubleWritable();

        /**
         * @param key     movie id
         * @param values  every rate emitted for this movie
         * @param context sink for (movieId, totalRate)
         */
        @Override
        protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            double sum = 0d;
            for (DoubleWritable value : values) {
                sum += value.get();
            }
            v.set(sum);
            context.write(key, v);
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // Name the job after this class (previously it was misnamed "LineDemo").
        Job job = Job.getInstance(conf, MovieSumRate.class.getSimpleName());
        job.setMapperClass(MovieSumRateMapper.class);
        job.setReducerClass(MovieSumRateReducer.class);
        // Map output types could be omitted because they match the final output
        // types, but declaring them explicitly avoids surprises later.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DoubleWritable.class);
        // Final result types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // Input data path and result path.
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\movie\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\movie\\res2"));
        // Propagate job success/failure through the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
4. Count the average score of each movie
package com._51doit.mr.movie;
import com._51doit.mr.line.LineDemo;
import com._51doit.pojo.MovieBean;
import com.alibaba.fastjson.JSON;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import java.io.IOException;
/**
* Author: 多易教育-行哥
* Date: 2020/7/10
* Description:
*/
/**
 * MapReduce job: average rating score per movie.
 * Map:    (offset, json line)   -> (movieId, rate)
 * Reduce: (movieId, [rate ...]) -> (movieId, sum(rate) / count)
 */
public class MovieAvgRate {

    /** Parses one JSON rating record per input line and emits (movieId, rate). */
    static class MovieAvgRateMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
        // Reused output key/value holders to avoid a per-record allocation.
        private final Text k = new Text();
        private final DoubleWritable v = new DoubleWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = null;
            try {
                line = value.toString();
                MovieBean mb = JSON.parseObject(line, MovieBean.class);
                k.set(mb.getMovie());
                v.set(mb.getRate());
                context.write(k, v);
            } catch (Exception e) {
                // Malformed records are reported and skipped (previously they
                // were silently swallowed, hiding data-quality problems).
                System.err.println("Skipping malformed record: " + line);
            }
        }
    }

    /** Averages all ratings observed for a single movie. */
    static class MovieAvgRateReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
        private final DoubleWritable v = new DoubleWritable();

        @Override
        protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            double sum = 0d; // running total of rates
            int count = 0;   // number of ratings seen
            for (DoubleWritable value : values) {
                sum += value.get();
                count++;
            }
            // The framework only invokes reduce() for keys that have at least
            // one value, so count is never 0 here.
            v.set(sum / count);
            context.write(key, v);
            // NOTE: the previous version wrapped this body in a try/catch that
            // swallowed every exception (including InterruptedException);
            // genuine failures should fail the task, so it was removed.
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // Name the job after this class (previously it was misnamed "LineDemo").
        Job job = Job.getInstance(conf, MovieAvgRate.class.getSimpleName());
        job.setMapperClass(MovieAvgRateMapper.class);
        job.setReducerClass(MovieAvgRateReducer.class);
        // Map output types could be omitted because they match the final output
        // types, but declaring them explicitly avoids surprises later.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DoubleWritable.class);
        // Final result types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // Input data path and result path.
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\movie\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\movie\\res12"));
        // Propagate job success/failure through the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
5. Count each user's total review score (map output key: uid, value: MovieBean)
When a custom class is used as a key or value in an MR program, it must be serializable:
- Value position: the class must be serializable.
- Key position: the class must be serializable AND sortable (comparable).
5.1 MovieWritable
/**
* Author: 多易教育-行哥
* Date: 2020/7/10
* Description: Encapsulates one movie review record.
* This class is placed on the MapTask output value, so it must be serializable.
* Note: do not use the JDK's built-in serialization — it carries too much
* redundant data (fully-qualified class name, field types, ...).
* ******* Use the Writable serialization mechanism provided by Hadoop instead.
*/
public class MovieWritable implements Writable {
/**
* 电影id
*/
private String movie;
private double rate;
private long timeStamp;
private int uid;
public String getMovie() {
return movie;
}
public void setMovie(String movie) {
this.movie = movie;
}
public double getRate() {
return rate;
}
public void setRate(double rate) {
this.rate = rate;
}
public long getTimeStamp() {
return timeStamp;
}
public void setTimeStamp(long timeStamp) {
this.timeStamp = timeStamp;
}
public int getUid() {
return uid;
}
public void setUid(int uid) {
this.uid = uid;
}
@Override
public String toString() {
return "Movie{" +
"movie='" + movie + '\'' +
", rate=" + rate +
", timeStamp=" + timeStamp +
", uid=" + uid +
'}';
}
// 注意写出的顺序和读取的顺序一致
/**
* 写的方法 序列化方式 序列化方式
*
* @param dataOutput
* @throws IOException
*/
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(movie);
dataOutput.writeDouble(rate);
dataOutput.writeLong(timeStamp);
dataOutput.writeInt(uid);
}
/**
* 读的方法 反序列化
*
* @param dataInput
* @throws IOException
*/
public void readFields(DataInput dataInput) throws IOException {
this.movie = dataInput.readUTF();
this.rate = dataInput.readDouble();
this.timeStamp = dataInput.readLong();
this.uid = dataInput.readInt();
}
}
package com._51doit.mr.movie;
import com._51doit.mr.line.LineDemo;
import com._51doit.pojo.MovieWritable;
import com.alibaba.fastjson.JSON;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import java.io.IOException;
/**
* Author: 多易教育-行哥
* Date: 2020/7/10
* Description:
*/
/**
 * MapReduce job: total rating score per user.
 * Map:    (offset, json line)            -> (uid, MovieWritable)
 * Reduce: (uid, [MovieWritable ...])     -> (uid, sum(rate))
 * Demonstrates a custom Writable on the map-output value position.
 */
public class MovieSumRateUid {

    /** Parses one JSON rating record per input line and emits (uid, record). */
    static class MovieSumRateUidMapper extends Mapper<LongWritable, Text, IntWritable, MovieWritable> {
        // Reused output key holder to avoid a per-record allocation.
        private final IntWritable k = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = null;
            try {
                line = value.toString();
                MovieWritable mw = JSON.parseObject(line, MovieWritable.class);
                k.set(mw.getUid());
                context.write(k, mw);
            } catch (Exception e) {
                // Malformed records are reported and skipped (previously they
                // were silently swallowed, hiding data-quality problems).
                System.err.println("Skipping malformed record: " + line);
            }
        }
    }

    /** Sums the rates of every record produced by a single user. */
    static class MovieSumRateUidReducer extends Reducer<IntWritable, MovieWritable, IntWritable, DoubleWritable> {
        private final DoubleWritable v = new DoubleWritable();

        @Override
        protected void reduce(IntWritable key, Iterable<MovieWritable> values, Context context) throws IOException, InterruptedException {
            double sum = 0d;
            for (MovieWritable mw : values) {
                sum += mw.getRate();
            }
            v.set(sum);
            context.write(key, v);
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // Name the job after this class (previously it was misnamed "LineDemo").
        Job job = Job.getInstance(conf, MovieSumRateUid.class.getSimpleName());
        job.setMapperClass(MovieSumRateUidMapper.class);
        job.setReducerClass(MovieSumRateUidReducer.class);
        // Map output types differ from the final output types here, so they
        // MUST be declared explicitly.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(MovieWritable.class);
        // Final result types.
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(DoubleWritable.class);
        // Input data path and result path.
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\movie\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\movie\\uid_sum_rate"));
        // Propagate job success/failure through the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
6. Count the top-N highest-rated records for each movie
package com._51doit.mr.movie;
import com._51doit.mr.line.LineDemo;
import com._51doit.pojo.MovieWritable;
import com.alibaba.fastjson.JSON;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
* Author: 多易教育-行哥
* Date: 2020/7/12
* Description: For each movie, output the n highest-rated records.
* KEY: movieID
* VALUE: MovieWritable
* Approach:
* 1 maptask: read one line -> MovieWritable -> (movie, MovieWritable)
* 2 reducetask: (movie, Iterator<MW>) -> copy the iterator's contents into a
*   list, sort by rate descending, output the first n records
*/
/**
 * MapReduce job: the TOP_N highest-rated records for each movie.
 * Map:    (offset, json line) -> (movieId, MovieWritable)
 * Reduce: copy each value out of Hadoop's reused iterator object, sort by
 *         rate descending, emit the first TOP_N records.
 */
public class Movie_RataTopN {

    /** How many records to keep per movie. */
    private static final int TOP_N = 3;

    /** Parses one JSON rating record per input line and emits (movieId, record). */
    static class Movie_RataTopNMapper extends Mapper<LongWritable, Text, Text, MovieWritable> {
        // Reused output key holder to avoid a per-record allocation.
        private final Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = null;
            try {
                line = value.toString();
                MovieWritable mw = JSON.parseObject(line, MovieWritable.class);
                k.set(mw.getMovie());
                context.write(k, mw);
            } catch (Exception e) {
                // Malformed records are reported and skipped (previously they
                // were silently swallowed, hiding data-quality problems).
                System.err.println("Skipping malformed record: " + line);
            }
        }
    }

    /** Collects, sorts, and emits the highest-rated records of one movie. */
    static class Movie_RataTopNReducer extends Reducer<Text, MovieWritable, MovieWritable, NullWritable> {

        @Override
        protected void reduce(Text key, Iterable<MovieWritable> values, Context context) throws IOException, InterruptedException {
            List<MovieWritable> list = new ArrayList<MovieWritable>();
            // Hadoop reuses ONE value object while iterating; each record must
            // be copied into a fresh instance before being stored, otherwise
            // the list ends up holding n references to the last record.
            // (Explicit setters replace the previous reflective
            // BeanUtils.copyProperties call: faster, and no checked exceptions
            // to swallow.)
            for (MovieWritable mw : values) {
                MovieWritable copy = new MovieWritable();
                copy.setMovie(mw.getMovie());
                copy.setRate(mw.getRate());
                copy.setTimeStamp(mw.getTimeStamp());
                copy.setUid(mw.getUid());
                list.add(copy);
            }
            // Sort by rate, highest first. Double.compare keeps the comparator
            // consistent: the previous "a - b > 0 ? -1 : 1" form never returned
            // 0 and was asymmetric for equal rates, violating the Comparator
            // contract (TimSort may throw "Comparison method violates its
            // general contract!").
            Collections.sort(list, new Comparator<MovieWritable>() {
                @Override
                public int compare(MovieWritable o1, MovieWritable o2) {
                    return Double.compare(o2.getRate(), o1.getRate());
                }
            });
            // Emit the top-N records for this movie.
            int limit = Math.min(TOP_N, list.size());
            for (int i = 0; i < limit; i++) {
                context.write(list.get(i), NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Configuration conf = new Configuration();
        // Name the job after this class (previously it was misnamed "LineDemo").
        Job job = Job.getInstance(conf, Movie_RataTopN.class.getSimpleName());
        job.setMapperClass(Movie_RataTopNMapper.class);
        job.setReducerClass(Movie_RataTopNReducer.class);
        // Map output types differ from the final output types, so they MUST be
        // declared explicitly.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(MovieWritable.class);
        // Final result types.
        job.setOutputKeyClass(MovieWritable.class);
        job.setOutputValueClass(NullWritable.class);
        // Input data path and result path.
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\movie\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\movie\\top_res2"));
        // Propagate job success/failure through the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}