hadoop之自定义分区类+采样器实现全排序

全排序
1.自定义partitioner规则类
public class Myparttitioner extends Partitioner<IntWritable,IntWritable> {
/*
分区函数:
keyIn ValueIn
分区编号: 0 1 2
每个区都是排好序的区与区之间也是排好序的
*/
public int getPartition(IntWritable year,
IntWritable temp,
int numPartitions) {
int x = year.get() - 1970 ;
if(x < 17) { //进行了全排序
return 0;
} else if (x >= 17 && x < 34) {
return 1;
} else {
return 2;
}
}
}

2.mapper编写
/**
 * Parses a "year temperature" text line and emits (year, temperature)
 * as IntWritables.
 *
 * Input : byte offset -> "1998 17"
 * Output: 1998 -> 17
 */
public class MapperSort extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

    // Reused across map() calls to avoid allocating two objects per record.
    private final IntWritable outYear = new IntWritable();
    private final IntWritable outTemp = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split on any whitespace so accidental double spaces or tabs still parse.
        String[] fields = value.toString().trim().split("\\s+");
        outYear.set(Integer.parseInt(fields[0]));
        outTemp.set(Integer.parseInt(fields[1]));
        context.write(outYear, outTemp);
    }
}

3.reducer编写
public class ReducerSort extends Reducer<IntWritable, IntWritable,IntWritable,IntWritable>{
protected void reduce(IntWritable year,
Iterable temps,
Context context) throws IOException,InterruptedException{
int max=Integer.MIN_VALUE;
IntWritable out = new IntWritable();
for(IntWritable iw : temps){
max=iw.get() > max ? iw.get():max;
}
out.set(max);
context.write(year,out);
}
}

4.数据生成:
public class DataProduct {
@Test
public void test() throws Exception{
//创建文件字符输出流
FileWriter fw=new FileWriter(“C:\Users\shinelon\Desktop\b.txt”);
for (int i=0;i<=100000;i++){
//0-51之间的数字的随机数
int year=1970 + new Random().nextInt(51);
//-15-44
int temp=-15+new Random().nextInt(59);
String data =year +" “+ temp +”\r\n";
fw.write(data);
}
fw.close();

}
@Test
public void data() throws IOException{
    Configuration conf = new Configuration();
    conf.set("fs.defaultFS","file:///");
    FileSystem fs = FileSystem.get(conf);
    Path p = new Path("C:\\Users\\shinelon\\Desktop\\b1.seq");
    SequenceFile.Writer writer
            =SequenceFile.createWriter(fs,conf,p, IntWritable.class,IntWritable.class);
    for(int i = 0 ; i < 6000 ; i ++){
        int year = 1970 + new Random().nextInt(100);
        int temp = -30 + new Random().nextInt(100);
        writer.append(new IntWritable(year),new IntWritable(temp));
    }
    writer.close();
}   

}

5.编写driver
public class app {
public static void main(String args[])throws Exception{
Configuration conf = new Configuration() ;
Job job= Job.getInstance(conf);
job.setPartitionerClass(Myparttitioner.class);//设置分区类,新增额
job.setMapperClass(MapperSort.class);
job.setReducerClass(ReducerSort.class);
job.setMapOutputValueClass(IntWritable.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setNumReduceTasks(3);//设置reduce个数,新增
FileInputFormat.setInputPaths(job,new Path(“C:\Users\shinelon\Desktop\b.txt”));
FileOutputFormat.setOutputPath(job,new Path(“C:\Users\shinelon\Desktop\hahaha111”));
job.waitForCompletion(true);
}
}

二、采样器进行全排序:
(1)map编写
package SamplerSort;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class SamperMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {
//序列文件
//第一个Intwritable 代表的就是key 对应空格前面的数
//第二个Intwritable 代表的就是value 对应空格后面的数
protected void map(IntWritable key, IntWritable value, Context context) throws IOException, InterruptedException {
context.write(key, value);
}
}
(2)reduce编写
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reduces all temperatures of one key (year) to the maximum temperature.
 */
public class SamperReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

    // Reused output value to avoid per-call allocation.
    private final IntWritable out = new IntWritable();

    /**
     * The original declared a raw {@code Iterable}, which does not compile with
     * the typed for-each below; {@code Iterable<IntWritable>} matches the
     * Reducer contract.
     */
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        for (IntWritable iw : values) {
            max = Math.max(max, iw.get());
        }
        out.set(max);
        context.write(key, out);
    }
}

(3)samperAPP
public class SamperApp {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//单例作业
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
//设置job的各种属性 //设置job名称//设置搜索类
job.setInputFormatClass(SequenceFileInputFormat.class);
//设置输入路径
FileInputFormat.addInputPath(job,new Path(“C:\Users\shinelon\Desktop\b1.seq”));
//设置输出路径
FileOutputFormat.setOutputPath(job,new Path(“C:\Users\shinelon\Desktop\out11”));
job.setMapperClass(SamperMapper.class); //设置mapper类
job.setReducerClass(SamperReducer.class); //设置reduecer类
job.setMapOutputKeyClass(IntWritable.class); //设置之map输出key
job.setMapOutputValueClass(IntWritable.class); //设置map输出value
job.setOutputKeyClass(IntWritable.class); //设置mapreduce 输出ke
job.setOutputValueClass(IntWritable.class); //设置mapreduce输出value
//创建随机采样对象
/**
* RandomSampler
* 1:每个key被选中的概率
* 6000:抽取样本的总数
* 3:最大采样切片数 分区数
*/
InputSampler.Sampler<LongWritable,IntWritable> sampler =
new InputSampler.RandomSampler<LongWritable, IntWritable>(0.1,10000,10);
job.setNumReduceTasks(3); //设置reduce个数

    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),new Path("D:\\fdo\\par.list"));
    //设置全排序分区类
    job.setPartitionerClass(TotalOrderPartitioner.class);
    InputSampler.writePartitionFile(job,sampler);
    job.waitForCompletion(true);

}

}

自定义partition规则类与使用采样器的区别:
(1)自定义的可以根据自己的设定来设置分区规则,比如取模和做差
// Custom partition rule: map the year's offset from 1970 into one of 3 ranges,
// so partition 0 holds the smallest years, partition 2 the largest.
public int getPartition(IntWritable year,//custom, hand-written partitioning rule
IntWritable temp,
int numPartitions) {
int x = year.get() - 1970 ;
if(x < 17) { //keys within each partition are sorted during the shuffle
return 0;
} else if (x >= 17 && x < 34) {
return 1;
} else {
return 2;
}
}
(2)采样器设置
//创建随机采样对象
/**
 * RandomSampler 参数说明:
 *   0.1:每个key被选中的概率
 *   10000:抽取样本的总数
 *   10:最大采样切片数(分区数)
 */
InputSampler.Sampler<IntWritable,IntWritable> sampler =
    new InputSampler.RandomSampler<IntWritable, IntWritable>(0.1, 10000, 10);
job.setNumReduceTasks(3); //设置reduce个数
//将sample数据写入分区文件
/**
 * 注意:这里传入的是 job.getConfiguration(),不是之前 new 出来的 conf
 */
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("D:\\fdo\\par.list"));
//设置全排序分区类
job.setPartitionerClass(TotalOrderPartitioner.class);
InputSampler.writePartitionFile(job, sampler);

猜你喜欢

转载自blog.csdn.net/weixin_44703894/article/details/109536178