Hadoop——topN

本节目标:

通过一个求topN的案例,掌握MR的开发流程。


根据已知的数据集,数据集每一行的文本内容是某个日期时间及其对应的温度,两者之间用制表符分隔,日期时间格式为 yyyy-MM-dd HH:mm:ss,温度以字母 c 结尾(例如:1949-10-01 14:21:02	34c)。

 

Q:求每年每月中出现的最高的两个温度值。

分析:先按年、月升序排序,年月相同时按温度降序排序;再按年月分组,把具有相同年月的数据分到同一组。由于组内数据已按温度降序排列,每组取前两条即为该月最高的两个温度值。

/**
 * Job driver: configures everything related to the MapReduce job and submits it.
 *
 * <p>The job reads {@code /temperature/input/temp2.txt}, runs {@link TempMap} and
 * {@link TempReduce} with the custom partitioner, sort comparator and grouping
 * comparator, and writes the result to {@code /temperature/output}. An existing
 * output directory is deleted before the job is submitted.
 *
 * <p>NOTE(review): the final output key/value classes are not set on the job; confirm
 * that the defaults are acceptable for the output format in use.
 *
 * @author devin
 */
public class TempJob {

    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * <p>On success prints {@code Job success!} and returns normally. On failure prints
     * {@code Job failed!} to standard error and terminates the JVM with exit status 1,
     * so that calling scripts can detect the failure.
     *
     * @param args command-line arguments (unused)
     * @throws IOException            if accessing the file system or submitting the job fails
     * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
     * @throws InterruptedException   if the wait for job completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Local test environment: the active NameNode / ResourceManager are set by hand.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:8020");
        conf.set("yarn.resourcemanager.hostname", "slave3");

        // Create the job and declare its entry class.
        Job job = Job.getInstance(conf);
        job.setJarByClass(TempJob.class);

        // Map side: mapper class plus the types of the map output key and value.
        job.setMapperClass(TempMap.class);
        job.setMapOutputKeyClass(Weather.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Shuffle: partitioning, sorting and grouping.
        job.setPartitionerClass(TempPartition.class);
        job.setSortComparatorClass(TempSort.class);
        job.setGroupingComparatorClass(TempGroup.class);

        // Reduce side: number of reduce tasks and the reducer class.
        job.setNumReduceTasks(3);
        job.setReducerClass(TempReduce.class);

        // Input file to process.
        Path inPath = new Path("/temperature/input/temp2.txt");
        FileInputFormat.addInputPath(job, inPath);

        // Output directory; remove a previous result so the job can write to it again.
        Path outPath = new Path("/temperature/output");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Submit the job and wait for it to complete.
        boolean flag = job.waitForCompletion(true);
        if (flag) {
            System.out.println("Job success!");
        } else {
            // Previously a failed job was silent and the process still exited with status 0.
            System.err.println("Job failed!");
            System.exit(1);
        }
    }
}


/**
 * Mapper that turns each input line into a {@link Weather} key and a temperature value.
 *
 * <p>Each line is expected to contain a timestamp in {@code yyyy-MM-dd HH:mm:ss} format,
 * a tab, and a temperature that ends with the letter {@code c} (for example
 * {@code 34c}). The year, month, day and temperature are copied into a {@code Weather}
 * object, which is emitted as the map output key; the temperature is emitted as the value.
 *
 * <p>Lines that do not match this shape are skipped: a counter in the {@code TempMap}
 * group is incremented and a one-line message is written to standard error.
 *
 * @author devin
 */
public class TempMap extends Mapper<LongWritable, Text, Weather, IntWritable> {

    /** Counter group under which skipped lines are reported. */
    private static final String COUNTER_GROUP = "TempMap";

    // Built once per mapper instance instead of once per record.
    // NOTE(review): SimpleDateFormat and Calendar are not thread-safe; this assumes a
    // mapper instance is only ever driven by a single thread - confirm for your runner.
    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    private final Calendar calendar = Calendar.getInstance();

    /**
     * Parses one line and emits {@code <Weather, temperature>}.
     *
     * @param key     offset of the line in the input
     * @param value   text of the line
     * @param context output context
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if writing the output record is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split the line into the timestamp and the temperature.
        String[] fields = StringUtils.split(value.toString(), "\t");
        if (fields == null || fields.length < 2) {
            skip(context, key, "MISSING_FIELD");
            return;
        }
        String dateTime = fields[0];
        String tempVal = fields[1];

        // The temperature is everything before the last 'c'; without it there is no number to cut out.
        int unitIndex = tempVal.lastIndexOf("c");
        if (unitIndex < 0) {
            skip(context, key, "MISSING_UNIT");
            return;
        }

        int temperature;
        try {
            temperature = Integer.parseInt(tempVal.substring(0, unitIndex));
        } catch (NumberFormatException e) {
            skip(context, key, "BAD_TEMPERATURE");
            return;
        }

        Date date;
        try {
            date = dateFormat.parse(dateTime);
        } catch (ParseException e) {
            skip(context, key, "BAD_TIMESTAMP");
            return;
        }
        calendar.setTime(date);

        // One Weather object per line, filled from the parsed timestamp and temperature.
        Weather w = new Weather();
        w.setYear(calendar.get(Calendar.YEAR));
        // Calendar.MONTH is zero-based, hence the + 1.
        w.setMonth(calendar.get(Calendar.MONTH) + 1);
        w.setDay(calendar.get(Calendar.DAY_OF_MONTH));
        w.setTemperature(temperature);

        // Map output: <weather, temperature>.
        context.write(w, new IntWritable(temperature));
    }

    /** Records a skipped line in the job counters and on standard error. */
    private void skip(Context context, LongWritable offset, String reason) {
        context.getCounter(COUNTER_GROUP, reason).increment(1);
        System.err.println("TempMap: skipping line at offset " + offset.get() + " (" + reason + ")");
    }
}
/**
 * Map output key holding the attributes used for sorting and grouping.
 *
 * <p>Implements {@code WritableComparable} so that instances can be serialized and
 * deserialized, and compared for sorting. The four fields are written and read as
 * ints in the order year, month, day, temperature.
 */
public class Weather implements WritableComparable<Weather> {
    // Attributes used for sorting.
    private int year;
    private int month;
    private int day;
    private int temperature; // temperature reading

    public int getYear() {
        return year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public int getMonth() {
        return month;
    }

    public void setMonth(int month) {
        this.month = month;
    }

    public int getDay() {
        return day;
    }

    public void setDay(int day) {
        this.day = day;
    }

    public int getTemperature() {
        return temperature;
    }

    public void setTemperature(int temperature) {
        this.temperature = temperature;
    }

    /** Deserializes the fields in the same order in which {@link #write} emits them. */
    @Override
    public void readFields(DataInput input) throws IOException {
        this.year = input.readInt();
        this.month = input.readInt();
        this.day = input.readInt();
        this.temperature = input.readInt();
    }

    /** Serializes the fields as four ints: year, month, day, temperature. */
    @Override
    public void write(DataOutput output) throws IOException {
        output.writeInt(this.year);
        output.writeInt(this.month);
        output.writeInt(this.day);
        output.writeInt(this.temperature);
    }

    /**
     * Ordering rule:
     * <ol>
     *   <li>year, ascending;</li>
     *   <li>if the years are equal: month, ascending;</li>
     *   <li>if year and month are equal: temperature, descending;</li>
     *   <li>if all of the above are equal: day, ascending. This tie-break only makes the
     *       order of equal temperatures within a month deterministic.</li>
     * </ol>
     *
     * @param o the object to compare with
     * @return a negative, zero or positive value as this object sorts before, equal to
     *         or after {@code o}
     */
    @Override
    public int compareTo(Weather o) {
        int byYear = Integer.compare(this.year, o.getYear());
        if (byYear != 0) {
            return byYear;
        }
        int byMonth = Integer.compare(this.month, o.getMonth());
        if (byMonth != 0) {
            return byMonth;
        }
        // Arguments swapped to get descending order without negating the result.
        int byTemperature = Integer.compare(o.getTemperature(), this.temperature);
        if (byTemperature != 0) {
            return byTemperature;
        }
        return Integer.compare(this.day, o.getDay());
    }

    /** Two instances are equal when all four fields match; consistent with {@link #compareTo}. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Weather)) {
            return false;
        }
        Weather other = (Weather) obj;
        return this.year == other.year
                && this.month == other.month
                && this.day == other.day
                && this.temperature == other.temperature;
    }

    /** Hash over the same four fields that {@link #equals} compares. */
    @Override
    public int hashCode() {
        int result = year;
        result = 31 * result + month;
        result = 31 * result + day;
        result = 31 * result + temperature;
        return result;
    }
}

/**
 * Custom partitioning rule, kept deliberately simple: records are assigned to a
 * partition by the year of their key.
 */
public class TempPartition extends HashPartitioner<Weather, IntWritable> {

    // NOTE(review): presumably the earliest year in the data set - confirm.
    private static final int BASE_YEAR = 1949;

    /**
     * Returns {@code (year - 1949)} modulo {@code numReduceTasks}, normalised to be
     * non-negative.
     *
     * <p>Java's {@code %} yields a negative remainder for a negative dividend, so a year
     * before 1949 used to produce a negative partition number. For years from 1949
     * onwards the result is unchanged.
     *
     * @param weather        map output key
     * @param value          map output value (not used)
     * @param numReduceTasks number of partitions
     * @return partition number in the range {@code [0, numReduceTasks)}
     */
    @Override
    public int getPartition(Weather weather, IntWritable value, int numReduceTasks) {
        int remainder = (weather.getYear() - BASE_YEAR) % numReduceTasks;
        return remainder < 0 ? remainder + numReduceTasks : remainder;
    }
}
/**
 * Sort comparator for {@link Weather} keys.
 *
 * <p>After records have been assigned to partitions by the custom partitioner, the data
 * inside each partition has to be ordered by some rule. Once the data is sorted, the
 * map-side shuffle is done (spilling is ignored in this description).
 */
public class TempSort extends WritableComparator {

    /**
     * Passes the key type to the parent constructor and asks it to create key instances.
     */
    public TempSort() {
        super(Weather.class, true);
    }

    /**
     * Delegates to {@code Weather.compareTo}; the ordering rule lives in the key class.
     */
    @Override
    public int compare(WritableComparable k1, WritableComparable k2) {
        return ((Weather) k1).compareTo((Weather) k2);
    }
}
/**
 * Grouping comparator used in the reduce-side shuffle, after the data has been fetched
 * from the map side. It decides which keys belong to the same group.
 *
 * @author lenovo
 */
public class TempGroup extends WritableComparator {

    /** Passes the key type to the parent constructor and asks it to create key instances. */
    public TempGroup() {
        super(Weather.class, true);
    }

    /**
     * Groups by the year and month of the key: two keys compare as equal, and so fall
     * into the same group, when both their year and their month are equal.
     */
    @Override
    public int compare(WritableComparable k1, WritableComparable k2) {
        Weather left = (Weather) k1;
        Weather right = (Weather) k2;

        int yearOrder = Integer.compare(left.getYear(), right.getYear());
        if (yearOrder != 0) {
            return yearOrder;
        }
        return Integer.compare(left.getMonth(), right.getMonth());
    }
}
/**
 * Reducer that writes the first two records of every group.
 *
 * <ul>
 *   <li>input key: the {@link Weather} object of the group;</li>
 *   <li>input values: the temperature belonging to each object;</li>
 *   <li>output key: the result line as text;</li>
 *   <li>output value: {@code NullWritable}.</li>
 * </ul>
 */
public class TempReduce extends Reducer<Weather, IntWritable, Text, NullWritable> {

    /**
     * Emits at most two lines per group, each of the form
     * {@code year-month-day<TAB>temperature}.
     *
     * <p>NOTE(review): year, month and day are read from the key inside the loop, so the
     * day printed for each value depends on what the key holds at that point of the
     * iteration - confirm against the framework's key handling.
     */
    @Override
    protected void reduce(Weather weather, Iterable<IntWritable> iterable, Context context)
            throws IOException, InterruptedException {
        int emitted = 0;
        for (IntWritable reading : iterable) {
            // Stop as soon as the top two have been written.
            if (emitted == 2) {
                break;
            }
            StringBuilder line = new StringBuilder();
            line.append(weather.getYear())
                    .append("-")
                    .append(weather.getMonth())
                    .append("-")
                    .append(weather.getDay())
                    .append("\t")
                    .append(reading.get());
            context.write(new Text(line.toString()), NullWritable.get());
            emitted++;
        }
    }
}

运行结果:



























猜你喜欢

转载自blog.csdn.net/ym01213/article/details/80051453