MapReduce Examples (Part 2)

3. Total Sort

        Design idea: To produce a totally sorted result, we could let each Map task sort its own input and then perform a global merge, but that design forces the number of Reduce tasks to be 1, which gives very little parallelism. To address this, we partition the data and sort within multiple partitions. Hadoop's built-in TotalOrderPartitioner creates the partitions for the sort job; before the partitions can be created, one of Hadoop's built-in samplers (here RandomSampler) samples the input and a partition file is generated from the observed data distribution. The key range of each partition is defined by this partition file, which guarantees that every Reduce task handles a contiguous range of keys, so the reducer outputs taken in partition order form a globally sorted result.

package com.hadoop.totalsort;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler.RandomSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalSort {
	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		
		Path inputPath = new Path(args[0]);
		Path outputPath = new Path(args[1]);
		// path of the partition file used by TotalOrderPartitioner
		Path partitionFile = new Path(args[2]);
		// number of reduce tasks (one partition per reducer)
		int reduceNumer = Integer.parseInt(args[3]);
		/* RandomSampler parameters:
		 *   1st: probability with which each record is selected
		 *   2nd: total number of samples to take
		 *   3rd: maximum number of InputSplits to read
		 */
		RandomSampler<Text, Text> sampler = new InputSampler.RandomSampler<Text,Text>(0.1, 10000, 10);	
		Configuration conf = new Configuration();	
		TotalOrderPartitioner.setPartitionFile(conf, partitionFile);

		Job job = new Job(conf);
		job.setJobName("TotalSort");
		job.setJarByClass(TotalSort.class);
		// KeyValueTextInputFormat splits each input line into key and value at the first '\t' by default
		job.setInputFormatClass(KeyValueTextInputFormat.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setNumReduceTasks(reduceNumer);	
		job.setPartitionerClass(TotalOrderPartitioner.class);
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		// delete the output directory if it already exists
		outputPath.getFileSystem(conf).delete(outputPath,true);	
		InputSampler.writePartitionFile(job, sampler);
		System.out.println(job.waitForCompletion(true)?0:1);	
	}
}
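
RandomSampler is only one of the samplers provided by InputSampler, and which one fits best depends on how the keys are distributed in the input. As a rough sketch (the parameter values below are illustrative, not tuned for any particular data set), either of the following could replace the RandomSampler line in main() above:

		// Alternative samplers from InputSampler; either one can be passed to
		// InputSampler.writePartitionFile(job, sampler) just like the RandomSampler above.

		// SplitSampler takes the first records of up to 10 splits (10000 samples in total).
		// It is cheap because it never scans a whole split, but it is only representative
		// when the input is not already sorted by key.
		InputSampler.Sampler<Text, Text> splitSampler =
				new InputSampler.SplitSampler<Text, Text>(10000, 10);

		// IntervalSampler walks up to 10 splits and keeps records at a fixed frequency
		// (here 1%), which is slower but behaves well on sorted or skewed input.
		InputSampler.Sampler<Text, Text> intervalSampler =
				new InputSampler.IntervalSampler<Text, Text>(0.01, 10);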

4. Merging Multiple Small Files

        As mentioned in earlier posts, HDFS is designed for storing large data sets, and a very large number of small files wastes resources. We can use CombineTextInputFormat.setMaxInputSplitSize to set the split size, so that many small files are combined into larger splits and, with the map-only job below, rewritten as larger output files.

package com.hadoop.combinefile;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SmallFileCombinerMapper extends Mapper<LongWritable, Text, Text, NullWritable>{
	
	NullWritable v = NullWritable.get();
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// emit the whole line as the key; the NullWritable value carries no data
		context.write(value, v);
	}
}
package com.hadoop.combinefile;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Driver {

	public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();	
		Job job = new Job(conf,"smallFileCombine");
		job.setJarByClass(Driver.class);
		job.setMapperClass(SmallFileCombinerMapper.class);	
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		
		// pack multiple small files into a single input split
		job.setInputFormatClass(CombineTextInputFormat.class);
		// maximum split size: files whose combined size is under 150 MB end up in one split
		CombineTextInputFormat.setMaxInputSplitSize(job, 1024*1024*150);
		CombineTextInputFormat.setInputPaths(job, new Path("hdfs://10.128.0.130:9000/user/root/CombineInput"));
		
		FileOutputFormat.setOutputPath(job, new Path("hdfs://10.128.0.130:9000/user/root/output9"));
		job.setNumReduceTasks(0);
		System.out.println("OK");
		System.out.println(job.waitForCompletion(true) ? 0 : 1);
	}
}
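
To see how much the combining actually helps, one can compare the number of input splits the two formats would generate for the same directory. The class below is a hypothetical helper (its name is mine; it simply reuses the input path and split size from the driver above), not part of the original job:

package com.hadoop.combinefile;

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitCountCheck {

	public static void main(String[] args) throws IOException, InterruptedException {
		Configuration conf = new Configuration();
		Job job = new Job(conf, "splitCountCheck");
		// same input directory and maximum split size as the driver above
		TextInputFormat.setInputPaths(job, new Path("hdfs://10.128.0.130:9000/user/root/CombineInput"));
		CombineTextInputFormat.setMaxInputSplitSize(job, 1024*1024*150);

		// TextInputFormat creates at least one split per file ...
		List<InputSplit> plain = new TextInputFormat().getSplits(job);
		// ... while CombineTextInputFormat packs many small files into each split
		List<InputSplit> combined = new CombineTextInputFormat().getSplits(job);

		System.out.println("TextInputFormat splits:        " + plain.size());
		System.out.println("CombineTextInputFormat splits: " + combined.size());
	}
}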

5. Data Deduplication

        The goal of data deduplication is to remove duplicate records from a file. In MapReduce, the map output is grouped by the shuffle into <key, value-list> pairs, so we only need to emit each record (the map input value) as the key, with an arbitrary value; in the reduce phase we output just the key and set the value to an empty string.

package com.hadoop.DeWeighting;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class DeMap extends Mapper<Object, Text, Text, Text>{
	
	public void map(Object key,Text value,Context context) throws IOException, InterruptedException {	
		context.write(value, new Text(""));		
	}
}
package com.hadoop.DeWeighting;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class DeReduce extends Reducer<Text, Text, Text, Text>{
	
	public void reduce(Text key,Iterable<Text> values,Context context) throws IOException, InterruptedException{
		context.write(key, new Text(""));
	}
}
package com.hadoop.DeWeighting;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		Job job = new Job(conf,"DeWeight");
		job.setJarByClass(Driver.class);
		job.setMapperClass(DeMap.class);	
		job.setReducerClass(DeReduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job, new Path("hdfs://10.128.0.130:9000/user/root/DeWeighting"));
		FileOutputFormat.setOutputPath(job, new Path("hdfs://10.128.0.130:9000/user/root/output10"));
		System.out.println("OK");
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
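
One optional refinement that is not in the original code: because DeReduce never looks at its values, it can also be registered as a combiner, so duplicate records are already collapsed on the map side before the shuffle. Adding a single line to the driver above is enough:

		// optional: reuse the reducer as a combiner to collapse duplicates map-side
		job.setCombinerClass(DeReduce.class);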




Reprinted from blog.csdn.net/u011380972/article/details/80904527