数据源
第一列是手机号、第二列是上传流量、第三列是下载流量
成果
类型参考
文件结构
Mapper源码
package demo;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// Generic parameters: input key (byte offset of the line), input value (one line of text),
// output key, output value.
public class FlowMapperIt extends Mapper<LongWritable, Text, Text, FlowBean> {
    /**
     * Parses one input line ("phone upFlow downFlow", space-separated) into a
     * FlowBean and emits it keyed by the first three digits of the phone number.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the line into its three columns: phone, upload flow, download flow.
        String[] fields = value.toString().split(" ");
        FlowBean bean = new FlowBean(fields[0], Long.parseLong(fields[1]), Long.parseLong(fields[2]));
        /*
         * 1. The emitted types must match the Mapper's 3rd/4th generic parameters
         *    (Text, FlowBean).
         * 2. The emitted key drives grouping/partitioning — similar to SQL GROUP BY.
         *    Here the key is the 3-digit phone prefix, so records sharing a prefix
         *    reach the same reduce call.
         */
        String prefix = bean.getPhone().substring(0, 3);
        context.write(new Text(prefix), bean);
    }
}
Reducer源码
package demo;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// Generic parameters: input key/value are the Mapper's output key/value types
// (Text, FlowBean); output key/value are what the job finally writes (Text, Text).
public class FlowReducerIt extends Reducer<Text, FlowBean, Text, Text> {
    /**
     * Sums the upload/download/total flow over every record sharing one key
     * (the 3-digit phone prefix emitted by the Mapper) and writes a single
     * tab-separated totals line for that key.
     *
     * All records with the same key arrive together in one reduce call; the
     * key here is the Mapper's output key and is unrelated to the partitioner.
     */
    @Override
    protected void reduce(Text arg0, Iterable<FlowBean> arg1, Context arg2)
            throws IOException, InterruptedException {
        long up_flow = 0L;
        long down_flow = 0L;
        long sum_flow = 0L;
        Iterator<FlowBean> i = arg1.iterator();
        while (i.hasNext()) {
            FlowBean f = i.next();
            up_flow += f.getUp_flow();
            down_flow += f.getDown_flow();
            sum_flow += f.getSum_flow();
        }
        // BUG FIX: the original called write() INSIDE the loop, emitting one
        // cumulative partial-sum line per input record. Write exactly one
        // totals line per key, after all values have been accumulated.
        arg2.write(arg0, new Text(up_flow + "\t" + down_flow + "\t" + sum_flow));
    }
}
Partitioner源码
package demo;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
// Generic parameters: the Mapper's output key and value types
// (the partitioner's input is the Mapper's output).
public class FlowPartioner extends Partitioner<Text, FlowBean> {
    /*
     * Partition lookup table: phone-number prefix -> partition index.
     * e.g. records keyed by prefix "131" go to partition 0.
     */
    static Map<String, Integer> map = new HashMap<String, Integer>();
    static {
        map.put("131", 0);
        map.put("186", 1);
        map.put("137", 2);
        map.put("138", 3);
    }
    /**
     * Routes a record to a reduce partition by its key (the 3-digit phone
     * prefix). Unknown prefixes all fall into one catch-all partition.
     */
    @Override
    public int getPartition(Text arg0, FlowBean arg1, int arg2) {
        // The key already is the 3-digit prefix produced by the Mapper.
        String phoneNum = arg0.toString();
        Integer code = map.get(phoneNum);
        if (code == null) {
            // BUG FIX: the original returned map.size() + 1 == 5, which is out
            // of range for the job's 5 reduce tasks (valid partitions are 0-4)
            // and makes the job fail with "Illegal partition". Use index 4
            // (map.size()) as the catch-all partition for unmatched prefixes.
            return map.size();
        }
        return code;
    }
}
Main源码
package demo;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class FlowPartionalMain {
    /** Configures and submits the flow-summation MapReduce job. */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Initialize the job from a fresh Hadoop configuration.
        Configuration conf = new Configuration();
        Job wcjob = Job.getInstance(conf);
        // Jar containing this driver (and the mapper/reducer/partitioner).
        wcjob.setJarByClass(FlowPartionalMain.class);
        // Mapper and reducer implementations.
        wcjob.setMapperClass(FlowMapperIt.class);
        wcjob.setReducerClass(FlowReducerIt.class);
        // Custom partitioner routing keys by phone-number prefix.
        wcjob.setPartitionerClass(FlowPartioner.class);
        // Number of reduce tasks == number of partitions.
        wcjob.setNumReduceTasks(5);
        // Mapper output key/value types.
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(FlowBean.class);
        // Reducer (final) output key/value types.
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(Text.class);
        // Input file location.
        FileInputFormat.setInputPaths(wcjob, new Path("/Users/apple/Downloads/QQ下载/flow.txt"));
        // Output location for the job results. Two requirements:
        // 1) it is a directory, 2) the directory must not already exist.
        FileOutputFormat.setOutputPath(wcjob, new Path("/Users/apple/Downloads/phone/"));
        // Submit the job to the YARN cluster and wait for completion.
        boolean res = wcjob.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
Bean源码
package demo;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Writable bean carrying one phone record: phone number, upload flow,
 * download flow, and their sum. Serialized between map and reduce phases.
 */
public class FlowBean implements WritableComparable<FlowBean> {
    private String phone;
    private long up_flow;
    private long down_flow;
    private long sum_flow;
    /*
     * No-arg constructor (required): Hadoop's deserialization uses reflection
     * and needs to instantiate the bean before calling readFields().
     */
    public FlowBean() {
        super();
    }
    /** Builds a bean from one parsed input record; sum_flow is derived as up + down. */
    public FlowBean(String phone, long up_flow, long down_flow) {
        super();
        this.phone = phone;
        this.up_flow = up_flow;
        this.down_flow = down_flow;
        this.sum_flow = this.up_flow + this.down_flow;
    }
    /*
     * Getters and setters.
     */
    public String getPhone() {
        return phone;
    }
    public void setPhone(String phone) {
        this.phone = phone;
    }
    public long getUp_flow() {
        return up_flow;
    }
    public void setUp_flow(long up_flow) {
        this.up_flow = up_flow;
    }
    public long getDown_flow() {
        return down_flow;
    }
    public void setDown_flow(long down_flow) {
        this.down_flow = down_flow;
    }
    public long getSum_flow() {
        return sum_flow;
    }
    public void setSum_flow(long sum_flow) {
        this.sum_flow = sum_flow;
    }
    /** Serializes this bean's fields to the stream. */
    @Override
    public void write(DataOutput arg0) throws IOException {
        arg0.writeUTF(phone);
        arg0.writeLong(up_flow);
        arg0.writeLong(down_flow);
        arg0.writeLong(sum_flow);
    }
    /*
     * Deserializes the bean from the stream.
     * NOTE: fields must be read in exactly the same order they were written.
     */
    @Override
    public void readFields(DataInput arg0) throws IOException {
        this.phone = arg0.readUTF();
        this.up_flow = arg0.readLong();
        this.down_flow = arg0.readLong();
        this.sum_flow = arg0.readLong();
    }
    /**
     * Orders beans by total flow, ascending.
     *
     * BUG FIX: the original returned only 1 or -1 and never 0, so two beans
     * with equal sum_flow each compared "less than" the other, violating the
     * Comparable contract (sgn(x.compareTo(y)) must equal -sgn(y.compareTo(x)))
     * and risking broken behavior in sorts and sorted collections.
     * Long.compare is also overflow-safe, unlike subtraction.
     */
    @Override
    public int compareTo(FlowBean o) {
        return Long.compare(sum_flow, o.getSum_flow());
    }
}