三、全排序
设计思想:实现全排序,我们可以利用Map任务对自己输入的任务进行排序,然后进行全局排序,但是这种设计导致Reduce任务的个数只能是1,并行度不高。针对该问题,我们利用分区,进行多分区排序的方法。利用Hadoop自带的TotalOrderPartitioner,为排序作业创建分区,在创建分区之前需要使用Hadoop默认的抽样器先对其抽样,根据数据分布生成分区文件,分区中数据的范围需要通过分区文件来指定。这样就保证了每个任务处理的都是连续区间的数据。
package com.hadoop.totalsort;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler.RandomSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
public class TotalSort {
    /**
     * Total-order sort driver: samples the input with a RandomSampler to build
     * a partition file, then uses TotalOrderPartitioner so each reducer
     * receives a contiguous, non-overlapping key range. The concatenation of
     * the reducer outputs is therefore globally sorted.
     *
     * args: [0] input path, [1] output path, [2] partition file path,
     *       [3] number of reduce tasks
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 4) {
            System.err.println("Usage: TotalSort <input> <output> <partitionFile> <numReduces>");
            System.exit(2);
        }
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        // Path of the partition (boundary) file produced by the sampler.
        Path partitionFile = new Path(args[2]);
        // Number of reduce tasks (= number of partitions).
        int reduceNumber = Integer.parseInt(args[3]);
        /*
         * RandomSampler(freq, numSamples, maxSplitsSampled):
         *   freq             - probability with which each record is selected
         *   numSamples       - total number of samples to keep
         *   maxSplitsSampled - maximum number of InputSplits to read
         */
        RandomSampler<Text, Text> sampler = new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10);
        Configuration conf = new Configuration();
        // Register the partition file before the Job snapshots the conf.
        TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
        Job job = Job.getInstance(conf, "TotalSort"); // new Job(conf) is deprecated
        job.setJarByClass(TotalSort.class);
        // Input lines are split into key/value on the first tab by default.
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setNumReduceTasks(reduceNumber);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        // Remove a stale output directory so the job does not fail on startup.
        outputPath.getFileSystem(conf).delete(outputPath, true);
        // Sample the input and write partition boundaries before submission.
        InputSampler.writePartitionFile(job, sampler);
        // Propagate success/failure via the process exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
四、合并多个小文件
在前面的文章中我们就说过,hdfs用于存储大的数据集,如果小文件的数量太多,是对资源的一种浪费。那么,我们可以使用CombineTextInputFormat.setMaxInputSplitSize设置切片的大小,将小文件合并为大文件。
package com.hadoop.combinefile;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class SmallFileCombinerMapper extends Mapper<LongWritable, Text, Text, NullWritable>{
NullWritable v = NullWritable.get();
protected void map(LongWritable key,Text vakue,Context context) throws IOException, InterruptedException{
context.write(vakue, v);
}
}
package com.hadoop.combinefile;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {
    /**
     * Small-file merge driver: CombineTextInputFormat packs multiple small
     * files into splits of at most 150 MB, and a map-only job
     * (numReduceTasks = 0) writes the merged content straight back out.
     */
    public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "smallFileCombine"); // new Job(conf, name) is deprecated
        job.setJarByClass(Driver.class);
        job.setMapperClass(SmallFileCombinerMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Pack multiple small files into each input split.
        job.setInputFormatClass(CombineTextInputFormat.class);
        // Files totalling less than 150 MB are combined into a single split
        // (long literal avoids any int-overflow risk for larger limits).
        CombineTextInputFormat.setMaxInputSplitSize(job, 1024L * 1024 * 150);
        CombineTextInputFormat.setInputPaths(job, new Path("hdfs://10.128.0.130:9000/user/root/CombineInput"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://10.128.0.130:9000/user/root/output9"));
        // Map-only job: no shuffle, mappers write output directly.
        job.setNumReduceTasks(0);
        // Propagate success/failure via the process exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
五、数据去重
数据去重的目的是去除文件中重复的数据,在mapreduce中我们可以利用map的输出经过shuffle过程聚集成<key,value-list>的形式。因此,我们只需要将map阶段的value输出为key,value任意;在reduce阶段只将key输出,value置为空就可以了。
package com.hadoop.DeWeighting;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class DeMap extends Mapper<Object, Text, Text, Text>{
public void map(Object key,Text value,Context context) throws IOException, InterruptedException {
context.write(value, new Text(""));
}
}
package com.hadoop.DeWeighting;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class DeReduce extends Reducer<Text, Text, Text, Text>{
public void reduce(Text key,Iterable<Text> values,Context context) throws IOException, InterruptedException{
context.write(key, new Text(""));
}
}
package com.hadoop.DeWeighting;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {
    /**
     * De-duplication driver: DeMap emits each line as a key, the shuffle
     * groups duplicate keys, and DeReduce writes each distinct key once.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "DeWeight"); // new Job(conf, name) is deprecated
        job.setJarByClass(Driver.class);
        job.setMapperClass(DeMap.class);
        job.setReducerClass(DeReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("hdfs://10.128.0.130:9000/user/root/DeWeighting"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://10.128.0.130:9000/user/root/output10"));
        // Propagate success/failure via the process exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}