一 自定义jar的流程
- 配置相关的内容
- 自定义map输出的k,v类(此类必须实现WritableComparable序列化和比较器接口,实现序列化,反序列化和通用排序方法)
- 自定义Map类(必须继承Mapper类,重写map函数)
- 自定义数据在map中排序方法(必须继承WritableComparator,实现相关方法)
- 设置reducetask的数量(默认为一个reducetask)
- 自定义分区方法(必须继承Partitioner,实现getPartition方法)。可选
- 如果按照步骤4的排序进行分组,不符合需求时,可以改变分组的边界,自定义分组类(必须继承WritableComparator,实现相关方法)
- 自定义reduce类(必须继承Reducer,实现reduce方法)
- 提交job作业
二 具体示例
需求:
在下面数据中找到每个月份中温度最高的两天,并输出高温的年月日和温度
1949-10-01 14:21:02 34c
1949-10-01 19:21:02 38c
1949-10-02 14:01:02 36c
1950-01-01 11:21:02 32c
1950-10-01 12:21:02 37c
1951-12-01 12:21:02 23c
1950-10-02 12:21:02 41c
1950-10-03 12:21:02 27c
1951-07-01 12:21:02 45c
1951-07-02 12:21:02 46c
1951-07-03 12:21:03 47c
思路:把每行字符串进行切割,切割成年月日和温度,分别用自定义天气类保存。把数据按照年月正序,温度倒序的方式对数据进行排序。相同的年月为一组,reduce端对每组数据进行遍历,第一个一定是最高温,然后比较后面的温度,且不能为同一天的数据。
2.1主方法(配置)
package com.xjq.tianqi;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TqMain {
    /**
     * Driver: configures and submits the "tianqi" (weather) MapReduce job that
     * finds the two hottest distinct days of every month.
     *
     * Fix: the original ignored the boolean returned by waitForCompletion, so the
     * process always exited 0 even when the job failed; the exit code now reflects
     * job success/failure.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Job configuration
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(TqMain.class);
        job.setJobName("tianqi");

        // 2. Input/output paths. The output path must not exist when the job
        //    starts, so delete it (recursively) if it does.
        Path inpath = new Path("/tq/input");
        // Fully qualified because the top of the file imports the old
        // org.apache.hadoop.mapred.FileInputFormat under the same simple name.
        org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, inpath);
        Path outpath = new Path("/tq/output");
        if (outpath.getFileSystem(conf).exists(outpath)) {
            outpath.getFileSystem(conf).delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);

        // 3. Mapper and its output key/value types
        job.setMapperClass(TqMapper.class);
        job.setMapOutputKeyClass(Tq.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 4. Sort comparator: year/month ascending, temperature descending
        job.setSortComparatorClass(TqSortComp.class);

        // 5. Partitioner (optional; default is a single partition)
        job.setPartitionerClass(TqPartition.class);

        // 6. Grouping comparator: records with the same year+month form one reduce
        //    group. Without it, grouping follows TqSortComp, i.e. only records with
        //    identical year+month+temperature would be grouped together.
        job.setGroupingComparatorClass(TqGroupComp.class);

        // 7. Number of reduce tasks (= number of partitions)
        job.setNumReduceTasks(2);

        // 8. Reducer
        job.setReducerClass(TqReduce.class);

        // 9. Submit, wait, and propagate the result through the exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
2.2自定义map输出的key。自定义一个天气类
属性:year,month,day,wd(温度)
注意:序列化和反序列化的顺序问题和数值类型
package com.xjq.tianqi;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
//WritableComparable:实现序列化和比较器的一个接口
/**
 * Map output key: one weather observation, decomposed into year/month/day plus
 * the temperature. Implements WritableComparable so Hadoop can serialize it and
 * sort it by the natural order defined in {@link #compareTo(Tq)}.
 *
 * Fix: added equals/hashCode consistent with compareTo (year, month, day).
 * A key class used in hash-based partitioning or containers must define both;
 * the original relied on Object identity.
 */
public class Tq implements WritableComparable<Tq> {
    private int year;
    private int month;
    private int day;
    private int wd; // temperature, e.g. 34 parsed from "34c"

    public Tq() {
        super();
    }

    public int getYear() {
        return year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public int getMonth() {
        return month;
    }

    public void setMonth(int month) {
        this.month = month;
    }

    public int getDay() {
        return day;
    }

    public void setDay(int day) {
        this.day = day;
    }

    public int getWd() {
        return wd;
    }

    public void setWd(int wd) {
        this.wd = wd;
    }

    /** Rendered as "year-month-day" (no zero padding), used by the reducer's output key. */
    @Override
    public String toString() {
        return year + "-" + month + "-" + day;
    }

    /**
     * Deserialization. Field order and types MUST mirror {@link #write(DataOutput)}
     * exactly: year, month, day, wd, all as int.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.setYear(in.readInt());
        this.setMonth(in.readInt());
        this.setDay(in.readInt());
        this.setWd(in.readInt());
    }

    /**
     * Serialization. Writes year, month, day, wd as ints, in that order.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.getYear());
        out.writeInt(this.getMonth());
        out.writeInt(this.getDay());
        out.writeInt(this.getWd());
    }

    /**
     * Natural order: year, then month, then day, all ascending.
     * Temperature is deliberately excluded; the job overrides sorting with
     * TqSortComp anyway.
     */
    @Override
    public int compareTo(Tq tq) {
        int c1 = Integer.compare(this.getYear(), tq.getYear());
        if (c1 == 0) {
            int c2 = Integer.compare(this.getMonth(), tq.getMonth());
            if (c2 == 0) {
                return Integer.compare(this.getDay(), tq.getDay());
            }
            return c2;
        }
        return c1;
    }

    /** Consistent with compareTo: equality on year, month and day (wd excluded). */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Tq)) {
            return false;
        }
        Tq other = (Tq) obj;
        return year == other.year && month == other.month && day == other.day;
    }

    /** Hash over the same fields as equals. */
    @Override
    public int hashCode() {
        int result = year;
        result = 31 * result + month;
        result = 31 * result + day;
        return result;
    }
}
2.3设置map类
package com.xjq.tianqi;
import java.io.IOException;
import java.sql.Date;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.jboss.netty.util.internal.StringUtil;
/**
 * Splits each input line "yyyy-MM-dd HH:mm:ss\t<temp>c" into a Tq key
 * (year/month/day/temperature) and an IntWritable temperature value.
 *
 * Fixes: replaced the accidental third-party dependency
 * org.jboss.netty.util.internal.StringUtil.split with the standard
 * String.split; hoisted the SimpleDateFormat out of map() so it is not
 * re-created for every record (safe: a map task runs single-threaded);
 * malformed lines are now skipped instead of throwing
 * ArrayIndexOutOfBoundsException.
 */
public class TqMapper extends Mapper<LongWritable, Text, Tq, IntWritable> {
    // Reused output objects — standard Hadoop idiom to avoid per-record allocation.
    private final Tq tq = new Tq();
    private final IntWritable tval = new IntWritable();
    // Pattern parses only the date portion; the trailing time-of-day is ignored.
    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line into date-time and temperature fields.
        String[] fields = value.toString().split("\t");
        if (fields.length < 2) {
            return; // skip malformed lines
        }
        try {
            // Extract year/month/day from the date part.
            java.util.Date date = dateFormat.parse(fields[0]);
            Calendar calendar = Calendar.getInstance();
            calendar.setTime(date);
            tq.setYear(calendar.get(Calendar.YEAR));
            tq.setMonth(calendar.get(Calendar.MONTH) + 1); // Calendar months are 0-based
            tq.setDay(calendar.get(Calendar.DAY_OF_MONTH));
            // Strip the trailing 'c' unit from the temperature, e.g. "34c" -> 34.
            int wd = Integer.parseInt(fields[1].substring(0, fields[1].lastIndexOf("c")));
            tq.setWd(wd);
            tval.set(wd);
            context.write(tq, tval);
        } catch (ParseException e) {
            // Best-effort: log the bad record and keep processing the split.
            e.printStackTrace();
        }
    }
}
2.4设置自定义排序器
把数据按照年月正序,温度倒序排列
package com.xjq.tianqi;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Shuffle sort comparator: orders map output keys by year ascending, month
 * ascending, then temperature DESCENDING, so the hottest record of each
 * year-month arrives first at the reducer.
 *
 * Fix: the original cached the cast keys in mutable instance fields (tq1/tq2),
 * which added pointless shared state (and a thread-safety hazard) — they are
 * now plain locals. The old comment also wrongly claimed the super constructor
 * created those fields; super(Tq.class, true) actually tells WritableComparator
 * to instantiate Tq key objects for deserialization.
 */
public class TqSortComp extends WritableComparator {
    public TqSortComp() {
        // true => have WritableComparator create Tq instances to deserialize into
        super(Tq.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Tq t1 = (Tq) a;
        Tq t2 = (Tq) b;
        int byYear = Integer.compare(t1.getYear(), t2.getYear());
        if (byYear != 0) {
            return byYear;
        }
        int byMonth = Integer.compare(t1.getMonth(), t2.getMonth());
        if (byMonth != 0) {
            return byMonth;
        }
        // Temperature descending: compare operands swapped.
        return Integer.compare(t2.getWd(), t1.getWd());
    }
}
2.5设置reduceTask的数量
看2.1中的第7点(setNumReduceTasks)
2.6自定义分区器
package com.xjq.tianqi;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Custom partitioner: routes each record to a reduce task by year modulo the
 * number of partitions, so all records of a given year land on the same
 * reducer.
 */
public class TqPartition extends Partitioner<Tq, IntWritable> {
    @Override
    public int getPartition(Tq key, IntWritable temperature, int numPartitions) {
        // Year values are positive, so the modulo is always a valid partition index.
        return key.getYear() % numPartitions;
    }
}
2.7设置分组边界
由于按照map的排序方法,会把相同年月温度的数据分为一组,不符合需求,所以需要重新界定分组边界。
package com.xjq.tianqi;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Grouping comparator for the reduce side: two keys belong to the same reduce
 * group when their year AND month match. This widens the groups relative to
 * TqSortComp (which also considers temperature), so one reduce() call sees a
 * whole month's records, hottest first.
 *
 * Fix: the original cached the cast keys in mutable instance fields (tq1/tq2) —
 * pointless shared state and a thread-safety hazard — replaced with locals.
 * Note super(Tq.class, true) tells WritableComparator to instantiate Tq keys
 * for deserialization; it never touched those fields.
 */
public class TqGroupComp extends WritableComparator {
    public TqGroupComp() {
        // true => have WritableComparator create Tq instances to deserialize into
        super(Tq.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Tq t1 = (Tq) a;
        Tq t2 = (Tq) b;
        int byYear = Integer.compare(t1.getYear(), t2.getYear());
        if (byYear != 0) {
            return byYear;
        }
        // Same year: group boundary is decided by month alone (day/wd ignored).
        return Integer.compare(t1.getMonth(), t2.getMonth());
    }
}
2.8自定义reduce类
实现找出每个月中温度最高的两天,剔除同一天多个高温的情况
package com.xjq.tianqi;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Emits the two hottest DISTINCT days of each year-month group.
 *
 * Each group's values arrive sorted hottest-first (TqSortComp), and Hadoop
 * re-fills the key object as the values iterator advances, so {@code tq}
 * always reflects the record currently being iterated. The first record is
 * therefore the month's maximum; the next record from a *different* day is the
 * second-hottest day, after which the method returns.
 */
public class TqReduce extends Reducer<Tq, IntWritable, Text, IntWritable> {
    // Reused output objects — standard Hadoop idiom to avoid per-record allocation.
    private final Text outKey = new Text();
    private final IntWritable outVal = new IntWritable();

    @Override
    protected void reduce(Tq tq, Iterable<IntWritable> tvals, Context context)
            throws IOException, InterruptedException {
        boolean wroteHottest = false;
        int hottestDay = 0;
        for (IntWritable tval : tvals) {
            if (!wroteHottest) {
                // First record of the group: the month's highest temperature.
                outKey.set(tq.toString());
                outVal.set(tval.get());
                context.write(outKey, outVal);
                wroteHottest = true;
                hottestDay = tq.getDay();
            }
            if (wroteHottest && hottestDay != tq.getDay()) {
                // First record on a different day: the second-hottest day. Done.
                outKey.set(tq.toString());
                outVal.set(tval.get());
                context.write(outKey, outVal);
                return;
            }
        }
    }
}