一 自定义jar的流程
- 配置相关的内容
- 自定义map输出的k,v类(此类必须实现WritableComparable序列化和比较器接口,实现序列化,反序列化和通用排序方法)
- 自定义Map类(必须继承Mapper类,重写map函数)
- 自定义数据在map中排序方法(必须继承WritableComparator,实现相关方法)
- 设置reducetask的数量(默认为一个reducetask)
- 自定义分区方法(必须继承Partitioner,实现getPartition方法)。可选
- 如果按照步骤4的排序进行分组,不符合需求时,可以改变分组的边界,自定义分组类(必须继承WritableComparator,实现相关方法)
- 自定义reduce类(必须继承Reducer,实现reduce方法)
- 提交job作业
二 具体示例
需求:
在下面数据中找到每个月份中温度最高的两天,并输出高温的年月日和温度
1949-10-01 14:21:02 34c
1949-10-01 19:21:02 38c
1949-10-02 14:01:02 36c
1950-01-01 11:21:02 32c
1950-10-01 12:21:02 37c
1951-12-01 12:21:02 23c
1950-10-02 12:21:02 41c
1950-10-03 12:21:02 27c
1951-07-01 12:21:02 45c
1951-07-02 12:21:02 46c
1951-07-03 12:21:03 47c
思路:把每行字符串进行切割,切割成年月日和温度,分别用自定义天气类保存。把数据按照年月正序,温度倒序的方式对数据进行排序。相同的年月为一组,reduce端对每组数据进行遍历,第一个一定是最高温,然后比较后面的温度,且不能为同一天的数据。
2.1主方法(配置)
package com.xjq.tianqi;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TqMain {
    /**
     * Driver: configures and submits the "tianqi" (weather) MapReduce job that
     * finds the two hottest distinct days of every month.
     *
     * Fix: the original ignored the boolean returned by waitForCompletion, so the
     * process always exited 0 even when the job failed; the exit code now reflects
     * job success/failure.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Job configuration
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(TqMain.class);
        job.setJobName("tianqi");

        // 2. Input/output paths. The output path must not exist when the job
        //    starts, so delete it (recursively) if it does.
        Path inpath = new Path("/tq/input");
        // Fully qualified because the top of the file imports the old
        // org.apache.hadoop.mapred.FileInputFormat under the same simple name.
        org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, inpath);
        Path outpath = new Path("/tq/output");
        if (outpath.getFileSystem(conf).exists(outpath)) {
            outpath.getFileSystem(conf).delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);

        // 3. Mapper and its output key/value types
        job.setMapperClass(TqMapper.class);
        job.setMapOutputKeyClass(Tq.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 4. Sort comparator: year/month ascending, temperature descending
        job.setSortComparatorClass(TqSortComp.class);

        // 5. Partitioner (optional; default is a single partition)
        job.setPartitionerClass(TqPartition.class);

        // 6. Grouping comparator: records with the same year+month form one reduce
        //    group. Without it, grouping follows TqSortComp, i.e. only records with
        //    identical year+month+temperature would be grouped together.
        job.setGroupingComparatorClass(TqGroupComp.class);

        // 7. Number of reduce tasks (= number of partitions)
        job.setNumReduceTasks(2);

        // 8. Reducer
        job.setReducerClass(TqReduce.class);

        // 9. Submit, wait, and propagate the result through the exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
2.2自定义map输出的key。自定义一个天气类
属性:year,month,day,wd(温度)
注意:序列化和反序列化的顺序问题和数值类型
package com.xjq.tianqi;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
//WritableComparable:实现序列化和比较器的一个接口
/**
 * Map output key: one weather observation, decomposed into year/month/day plus
 * the temperature. Implements WritableComparable so Hadoop can serialize it and
 * sort it by the natural order defined in {@link #compareTo(Tq)}.
 *
 * Fix: added equals/hashCode consistent with compareTo (year, month, day).
 * A key class used in hash-based partitioning or containers must define both;
 * the original relied on Object identity.
 */
public class Tq implements WritableComparable<Tq> {
    private int year;
    private int month;
    private int day;
    private int wd; // temperature, e.g. 34 parsed from "34c"

    public Tq() {
        super();
    }

    public int getYear() {
        return year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public int getMonth() {
        return month;
    }

    public void setMonth(int month) {
        this.month = month;
    }

    public int getDay() {
        return day;
    }

    public void setDay(int day) {
        this.day = day;
    }

    public int getWd() {
        return wd;
    }

    public void setWd(int wd) {
        this.wd = wd;
    }

    /** Rendered as "year-month-day" (no zero padding), used by the reducer's output key. */
    @Override
    public String toString() {
        return year + "-" + month + "-" + day;
    }

    /**
     * Deserialization. Field order and types MUST mirror {@link #write(DataOutput)}
     * exactly: year, month, day, wd, all as int.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.setYear(in.readInt());
        this.setMonth(in.readInt());
        this.setDay(in.readInt());
        this.setWd(in.readInt());
    }

    /**
     * Serialization. Writes year, month, day, wd as ints, in that order.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.getYear());
        out.writeInt(this.getMonth());
        out.writeInt(this.getDay());
        out.writeInt(this.getWd());
    }

    /**
     * Natural order: year, then month, then day, all ascending.
     * Temperature is deliberately excluded; the job overrides sorting with
     * TqSortComp anyway.
     */
    @Override
    public int compareTo(Tq tq) {
        int c1 = Integer.compare(this.getYear(), tq.getYear());
        if (c1 == 0) {
            int c2 = Integer.compare(this.getMonth(), tq.getMonth());
            if (c2 == 0) {
                return Integer.compare(this.getDay(), tq.getDay());
            }
            return c2;
        }
        return c1;
    }

    /** Consistent with compareTo: equality on year, month and day (wd excluded). */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Tq)) {
            return false;
        }
        Tq other = (Tq) obj;
        return year == other.year && month == other.month && day == other.day;
    }

    /** Hash over the same fields as equals. */
    @Override
    public int hashCode() {
        int result = year;
        result = 31 * result + month;
        result = 31 * result + day;
        return result;
    }
}
2.3设置map类
package com.xjq.tianqi;
import java.io.IOException;
import java.sql.Date;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.jboss.netty.util.internal.StringUtil;
/**
 * Splits each input line "yyyy-MM-dd HH:mm:ss\t<temp>c" into a Tq key
 * (year/month/day/temperature) and an IntWritable temperature value.
 *
 * Fixes: replaced the accidental third-party dependency
 * org.jboss.netty.util.internal.StringUtil.split with the standard
 * String.split; hoisted the SimpleDateFormat out of map() so it is not
 * re-created for every record (safe: a map task runs single-threaded);
 * malformed lines are now skipped instead of throwing
 * ArrayIndexOutOfBoundsException.
 */
public class TqMapper extends Mapper<LongWritable, Text, Tq, IntWritable> {
    // Reused output objects — standard Hadoop idiom to avoid per-record allocation.
    private final Tq tq = new Tq();
    private final IntWritable tval = new IntWritable();
    // Pattern parses only the date portion; the trailing time-of-day is ignored.
    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line into date-time and temperature fields.
        String[] fields = value.toString().split("\t");
        if (fields.length < 2) {
            return; // skip malformed lines
        }
        try {
            // Extract year/month/day from the date part.
            java.util.Date date = dateFormat.parse(fields[0]);
            Calendar calendar = Calendar.getInstance();
            calendar.setTime(date);
            tq.setYear(calendar.get(Calendar.YEAR));
            tq.setMonth(calendar.get(Calendar.MONTH) + 1); // Calendar months are 0-based
            tq.setDay(calendar.get(Calendar.DAY_OF_MONTH));
            // Strip the trailing 'c' unit from the temperature, e.g. "34c" -> 34.
            int wd = Integer.parseInt(fields[1].substring(0, fields[1].lastIndexOf("c")));
            tq.setWd(wd);
            tval.set(wd);
            context.write(tq, tval);
        } catch (ParseException e) {
            // Best-effort: log the bad record and keep processing the split.
            e.printStackTrace();
        }
    }
}
2.4设置自定义排序器
把数据按照年月正序,温度倒序排列
package com.xjq.tianqi;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Shuffle sort comparator: orders map output keys by year ascending, month
 * ascending, then temperature DESCENDING, so the hottest record of each
 * year-month arrives first at the reducer.
 *
 * Fix: the original cached the cast keys in mutable instance fields (tq1/tq2),
 * which added pointless shared state (and a thread-safety hazard) — they are
 * now plain locals. The old comment also wrongly claimed the super constructor
 * created those fields; super(Tq.class, true) actually tells WritableComparator
 * to instantiate Tq key objects for deserialization.
 */
public class TqSortComp extends WritableComparator {
    public TqSortComp() {
        // true => have WritableComparator create Tq instances to deserialize into
        super(Tq.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Tq t1 = (Tq) a;
        Tq t2 = (Tq) b;
        int byYear = Integer.compare(t1.getYear(), t2.getYear());
        if (byYear != 0) {
            return byYear;
        }
        int byMonth = Integer.compare(t1.getMonth(), t2.getMonth());
        if (byMonth != 0) {
            return byMonth;
        }
        // Temperature descending: compare operands swapped.
        return Integer.compare(t2.getWd(), t1.getWd());
    }
}
2.5设置reduceTask的数量
看2.1中的第7点(setNumReduceTasks)
2.6自定义分区器
package com.xjq.tianqi;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Custom partitioner: routes each record to a reduce task by year modulo the
 * number of partitions, so all records of a given year land on the same
 * reducer.
 */
public class TqPartition extends Partitioner<Tq, IntWritable> {
    @Override
    public int getPartition(Tq key, IntWritable temperature, int numPartitions) {
        // Year values are positive, so the modulo is always a valid partition index.
        return key.getYear() % numPartitions;
    }
}
2.7设置分组边界
由于按照map的排序方法,会把相同年月温度的数据分为一组,不符合需求,所以需要重新界定分组边界。
package com.xjq.tianqi;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Grouping comparator for the reduce side: two keys belong to the same reduce
 * group when their year AND month match. This widens the groups relative to
 * TqSortComp (which also considers temperature), so one reduce() call sees a
 * whole month's records, hottest first.
 *
 * Fix: the original cached the cast keys in mutable instance fields (tq1/tq2) —
 * pointless shared state and a thread-safety hazard — replaced with locals.
 * Note super(Tq.class, true) tells WritableComparator to instantiate Tq keys
 * for deserialization; it never touched those fields.
 */
public class TqGroupComp extends WritableComparator {
    public TqGroupComp() {
        // true => have WritableComparator create Tq instances to deserialize into
        super(Tq.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Tq t1 = (Tq) a;
        Tq t2 = (Tq) b;
        int byYear = Integer.compare(t1.getYear(), t2.getYear());
        if (byYear != 0) {
            return byYear;
        }
        // Same year: group boundary is decided by month alone (day/wd ignored).
        return Integer.compare(t1.getMonth(), t2.getMonth());
    }
}
2.8自定义reduce类
实现找出每个月中温度最高的两天,剔除同一天多个高温的情况
package com.xjq.tianqi;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Emits the two hottest DISTINCT days of each year-month group.
 *
 * Each group's values arrive sorted hottest-first (TqSortComp), and Hadoop
 * re-fills the key object as the values iterator advances, so {@code tq}
 * always reflects the record currently being iterated. The first record is
 * therefore the month's maximum; the next record from a *different* day is the
 * second-hottest day, after which the method returns.
 */
public class TqReduce extends Reducer<Tq, IntWritable, Text, IntWritable> {
    // Reused output objects — standard Hadoop idiom to avoid per-record allocation.
    private final Text outKey = new Text();
    private final IntWritable outVal = new IntWritable();

    @Override
    protected void reduce(Tq tq, Iterable<IntWritable> tvals, Context context)
            throws IOException, InterruptedException {
        boolean wroteHottest = false;
        int hottestDay = 0;
        for (IntWritable tval : tvals) {
            if (!wroteHottest) {
                // First record of the group: the month's highest temperature.
                outKey.set(tq.toString());
                outVal.set(tval.get());
                context.write(outKey, outVal);
                wroteHottest = true;
                hottestDay = tq.getDay();
            }
            if (wroteHottest && hottestDay != tq.getDay()) {
                // First record on a different day: the second-hottest day. Done.
                outKey.set(tq.toString());
                outVal.set(tval.get());
                context.write(outKey, outVal);
                return;
            }
        }
    }
}