I. Compute each student's total score in each subject (summed over the months)
chinese.txt
english.txt
math.txt
1. Create the path score in HDFS and upload the source files
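Each line of the source files has the layout month name score; the sample line below is the one quoted in the Mapper comment further down:

1 zhang 89

chinese.txt, english.txt and math.txt all share this layout; only the subject they record differs.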
Mapper.java
package com.study.score.day01;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * Sum each student's monthly scores for each of the three subjects.
 * key   : name
 * value : Student
 * Map   : maps the input data
 *
 * Number of Mappers = number of input splits
 */
public class ScoreMapper extends Mapper<LongWritable, Text, Text, Student> {

    @Override
    protected void map(LongWritable key, Text value,
            Mapper<LongWritable, Text, Text, Student>.Context context)
            throws IOException, InterruptedException {
        // File name: used to decide which subject the score belongs to
        FileSplit split = (FileSplit) context.getInputSplit();
        String textName = split.getPath().getName();
        String[] textNames = textName.split("\\.");
        textName = textNames[0];

        // One line of input, e.g. "1 zhang 89" -> month name score
        String lineContent = value.toString();
        String[] datas = lineContent.split(" ");
        String name = datas[1];
        String score = datas[2];

        Student student = new Student();
        student.setName(name);
        if ("chinese".equals(textName)) {
            student.setChinese(Integer.valueOf(score));
            student.setEnglish(0);
            student.setMath(0);
        } else if ("english".equals(textName)) {
            student.setEnglish(Integer.valueOf(score));
            student.setMath(0);
            student.setChinese(0);
        } else if ("math".equals(textName)) {
            student.setMath(Integer.valueOf(score));
            student.setChinese(0);
            student.setEnglish(0);
        }
        context.write(new Text(name), student);
    }
}
2.Reducer.java
package com.study.score.day01;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ScoreReducer extends Reducer<Text, Student, Text, Student> {

    @Override
    protected void reduce(Text key, Iterable<Student> values,
            Reducer<Text, Student, Text, Student>.Context context)
            throws IOException, InterruptedException {
        Student student = new Student();
        student.setName(key.toString());

        Integer chinese = 0;
        Integer english = 0;
        Integer math = 0;
        for (Student stu : values) {
            chinese = chinese + stu.getChinese();
            english = english + stu.getEnglish();
            math = math + stu.getMath();
        }
        student.setChinese(chinese);
        student.setEnglish(english);
        student.setMath(math);
        context.write(key, student);
    }
}
3.Student.java
package com.study.score.day01;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class Student implements Writable {

    private String name;
    private Integer chinese;
    private Integer english;
    private Integer math;

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public Integer getChinese() { return chinese; }
    public void setChinese(Integer chinese) { this.chinese = chinese; }
    public Integer getEnglish() { return english; }
    public void setEnglish(Integer english) { this.english = english; }
    public Integer getMath() { return math; }
    public void setMath(Integer math) { this.math = math; }

    @Override
    public void readFields(DataInput input) throws IOException {
        this.name = input.readUTF();
        this.chinese = input.readInt();
        this.english = input.readInt();
        this.math = input.readInt();
    }

    @Override
    public void write(DataOutput output) throws IOException {
        output.writeUTF(name);
        output.writeInt(chinese);
        output.writeInt(english);
        output.writeInt(math);
    }

    @Override
    public String toString() {
        return "Student [name=" + name + ", chinese=" + chinese
                + ", english=" + english + ", math=" + math + "]";
    }
}
4.Driver.java
package com.study.score.day01;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ScoreDriver {

    public static void main(String[] args)
            throws ClassNotFoundException, IOException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(ScoreDriver.class);
        job.setMapperClass(ScoreMapper.class);
        job.setReducerClass(ScoreReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Student.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Student.class);

        // Read every file under the input path; the result directory must not exist yet
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.76.131:9000/score"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.76.131:9000/score/result"));

        job.waitForCompletion(true);
    }
}
Summary:
The file name obtained from ((FileSplit) context.getInputSplit()).getPath().getName() is used to tell which subject a score belongs to.
II. Sorting
Note: if files uploaded through the Eclipse plugin end up garbled, check the encoding of the Eclipse workspace and set it to UTF-8 so that it matches the encoding of the files.
Sort the movies below by popularity (hot value).
惊天破 72
机械师2 83
奇异博士 67
但丁密码 79
比利林恩的中场战事 84
侠探杰克:永不回头 68
龙珠Z:复活的弗利萨 79
长城 56
1.PO (MovieBean)
package com.study.sort.day01;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class MovieBean implements WritableComparable<MovieBean> {

    private String name;
    private Integer hotNum;

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public Integer getHotNum() { return hotNum; }
    public void setHotNum(Integer hotNum) { this.hotNum = hotNum; }

    @Override
    public void readFields(DataInput input) throws IOException {
        this.name = input.readUTF();
        this.hotNum = input.readInt();
    }

    @Override
    public void write(DataOutput output) throws IOException {
        output.writeUTF(this.name);
        output.writeInt(this.hotNum);
    }

    @Override
    public String toString() {
        return "MovieBean [name=" + name + ", hotNum=" + hotNum + "]";
    }

    // Descending order: compare the other object against the current one
    @Override
    public int compareTo(MovieBean o) {
        return o.getHotNum() - this.getHotNum();
    }
}
2.Mapper
package com.study.sort.day01;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SortMapper extends Mapper<LongWritable, Text, MovieBean, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value,
            Mapper<LongWritable, Text, MovieBean, NullWritable>.Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] datas = line.split(" ");

        MovieBean movieBean = new MovieBean();
        movieBean.setName(datas[0]);
        movieBean.setHotNum(Integer.valueOf(datas[1]));

        // The bean is the key so the shuffle phase sorts by it; no value is needed
        context.write(movieBean, NullWritable.get());
    }
}
3.Driver
package com.study.sort.day01;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SortDriver {

    public static void main(String[] args)
            throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(SortDriver.class);
        job.setMapperClass(SortMapper.class);

        job.setMapOutputKeyClass(MovieBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.76.131:9000/sort"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.76.131:9000/sort/result"));

        job.waitForCompletion(true);
    }
}
Key points:
The PO class implements the comparison interface (WritableComparable) and defines how two objects compare.
MapReduce sorts by the KEY, so the PO class is used as the map output key.
III. Multi-stage MapReduce processing
On top of the first MR stage,
add a second job that processes the output of the first job.
Example:
Compute each person's total income over 3 months and sort the result.
First MR job: compute each person's total income.
Second MR job: sort by income (a driver sketch for chaining the two jobs follows below).
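A minimal driver sketch for chaining the two jobs is shown below. It is a skeleton under assumptions, not code from the original: the package name and the HDFS paths under /income are hypothetical, and the per-job Mapper/Reducer settings are left as comments to be filled in following the score example in section I and the sort example in section II. The point it illustrates is that the second job reads the first job's output directory and is only submitted after the first job has completed successfully.

package com.study.income.day01;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class IncomeDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Job 1: sum each person's income over the three months
        Job sumJob = Job.getInstance(conf);
        sumJob.setJarByClass(IncomeDriver.class);
        // ... set the Mapper/Reducer and key/value classes of the first job here,
        //     following the same pattern as the score example in section I
        FileInputFormat.setInputPaths(sumJob, new Path("hdfs://192.168.76.131:9000/income"));
        FileOutputFormat.setOutputPath(sumJob, new Path("hdfs://192.168.76.131:9000/income/sum"));

        // Only start the second job once the first one has finished successfully
        if (!sumJob.waitForCompletion(true)) {
            System.exit(1);
        }

        // Job 2: sort by total income; its input path is exactly job 1's output path
        Job sortJob = Job.getInstance(conf);
        sortJob.setJarByClass(IncomeDriver.class);
        // ... use a WritableComparable bean as the map output key of the second job,
        //     following the same pattern as the movie sort example in section II
        FileInputFormat.setInputPaths(sortJob, new Path("hdfs://192.168.76.131:9000/income/sum"));
        FileOutputFormat.setOutputPath(sortJob, new Path("hdfs://192.168.76.131:9000/income/result"));

        System.exit(sortJob.waitForCompletion(true) ? 0 : 1);
    }
}

The same Configuration object can be reused for both stages, but each stage needs its own Job instance with its own input and output paths.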
IV. Combiner: easing the load on the reducer
Word count example.
If several mappers each read part of the input, every mapper's raw output looks like:
hello 1
hello 1
...
All of it is finally sent to the reducer to be merged; with a large input the reducer carries a heavy load.
Therefore, pre-aggregate during the map phase: merge the counts locally first,
and only then send the partial results to the reducer for a second merge, which now only has to combine a handful of pre-aggregated values instead of every raw record.
1.Mapper
Reads the data, splits each line, and emits word/count pairs.
2.Combiner (extends Reducer)
3.Reducer
Receives the data already pre-aggregated by the combiner.
4.Driver
job.setCombinerClass(...);
There is no need to set output key/value classes for the combiner separately; its output types must match the mapper's output types.
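A minimal word-count sketch with a combiner, assuming space-separated input lines; the class names and HDFS paths are illustrative and not from the original. Because the merge logic of the combiner and the reducer is identical here, one Reducer class is registered for both roles, whereas the outline above keeps them as separate classes:

package com.study.combine.day01;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

    // 1. Mapper: read a line, split it, emit (word, 1)
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String word : value.toString().split(" ")) {
                context.write(new Text(word), new IntWritable(1));
            }
        }
    }

    // 2./3. Combiner and Reducer: both simply sum the counts for a word
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    // 4. Driver: register the combiner; no extra key/value settings are needed for it
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(WordCountMapper.class);
        job.setCombinerClass(WordCountReducer.class);   // local pre-aggregation on the map side
        job.setReducerClass(WordCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.76.131:9000/words"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.76.131:9000/words/result"));
        job.waitForCompletion(true);
    }
}

setCombinerClass takes a Reducer subclass whose input and output key/value types both equal the map output types, which is why the driver does not configure anything extra for it.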