1. Number of map tasks = number of input splits ≈ file size / 128 MB (see the sketch below)
2. Number of output files = number of Reducers
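As a rough sketch of rule 1, assuming a hypothetical 300 MB input file. Note that in practice FileInputFormat tolerates roughly a 10% overrun before cutting an extra split, so a file only slightly larger than one block (e.g. 129 MB) may still end up as a single split:

public class SplitCountSketch {
    public static void main(String[] args) {
        long blockSize = 128L * 1024 * 1024; // default HDFS block size, 128 MB
        long fileSize = 300L * 1024 * 1024;  // hypothetical 300 MB input file
        // Ceiling division: 300 MB spans 3 blocks => 3 splits => 3 map tasks
        long numSplits = (fileSize + blockSize - 1) / blockSize;
        System.out.println(numSplits + " splits"); // prints "3 splits"
    }
}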
II. Setting the Number of Reducers
Goal: compute the total profit for each month.
Source file (each line: month and profit, space-separated):
1 280
1 560
2 234
2 264
3 873
3 2323
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/*
 * 1. The number of map tasks is determined by the number of input splits.
 *    A split encapsulates the length of the data to process and the location
 *    of that data.
 * 2. The number of splits depends on: total file size / block size (128 MB).
 *    E.g., a 129 MB file > 128 MB gives 2 splits => 2 map tasks. (Strictly,
 *    FileInputFormat allows about a 10% overrun before cutting an extra split,
 *    so a file only 1 MB over one block may still become a single split.)
 * 3. Number of reduce tasks: Hadoop defaults to 1 Reducer. Also, the number
 *    of result files = the number of reduce tasks.
 * 4. Partitioning: Hadoop's default is HashPartitioner, which partitions by
 *    the hashCode of the key emitted by the Mapper.
 */
public class ProfitMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] datas = line.split(" ");
        // Emit (month, profit) for each input line
        context.write(new Text(datas[0]), new IntWritable(Integer.valueOf(datas[1])));
    }
}
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ProfitReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        // Sum all profit values for the same month (key)
        for (IntWritable value : values) {
            sum += value.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ProfitDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Load the configuration and create the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Set the entry point, Mapper and Reducer
        job.setJarByClass(ProfitDriver.class);
        job.setMapperClass(ProfitMapper.class);
        job.setReducerClass(ProfitReducer.class);
        job.setNumReduceTasks(3);

        // Set the Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Set the Reducer output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Set the input and output paths (the output directory must not already exist)
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.76.131:9000/profit"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.76.131:9000/profit/result1"));

        job.waitForCompletion(true);
    }
}
Run via Run As --> Run on Hadoop.
Result:
The result1 directory contains three result files (part-r-00000 through part-r-00002).
Records were assigned to reducers according to the hashCode of the Map output key, as sketched below.
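For intuition, here is a runnable sketch of the formula the default HashPartitioner applies. It uses String.hashCode() as a stand-in; Hadoop's Text type hashes the UTF-8 bytes instead, so the actual partition assignments on the cluster can differ:

public class HashPartitionDemo {
    public static void main(String[] args) {
        int numReduceTasks = 3;
        for (String key : new String[] {"1", "2", "3"}) {
            // Same formula as HashPartitioner; & Integer.MAX_VALUE clears the sign bit
            int partition = (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
            System.out.println("key=" + key + " -> partition " + partition);
        }
    }
}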
III. Custom Partitioner
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class ProfitPartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Default implementation (HashPartitioner):
        // return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;

        // With numReduceTasks = 3 there are three partitions: 0, 1 and 2
        if (key.toString().equals("1") || key.toString().equals("2")) {
            return 0;
        } else if (key.toString().equals("3")) {
            return 1;
        } else {
            return 2;
        }
    }
}
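To sanity-check the partitioning logic off-cluster, a quick hypothetical harness (the class PartitionerCheck is ours, not part of the job) could call getPartition directly:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class PartitionerCheck {
    public static void main(String[] args) {
        ProfitPartitioner partitioner = new ProfitPartitioner();
        for (String month : new String[] {"1", "2", "3", "4"}) {
            // numPartitions = 3, matching job.setNumReduceTasks(3) in the driver
            int p = partitioner.getPartition(new Text(month), new IntWritable(0), 3);
            System.out.println("month " + month + " -> partition " + p);
        }
        // Expected: 1 -> 0, 2 -> 0, 3 -> 1, anything else -> 2
    }
}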
package com.study.profit.day01;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ProfitDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Load the configuration and create the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Set the entry point, Mapper and Reducer
        job.setJarByClass(ProfitDriver.class);
        job.setMapperClass(ProfitMapper.class);
        job.setReducerClass(ProfitReducer.class);
        job.setNumReduceTasks(3);

        // Set the custom partitioner class
        job.setPartitionerClass(ProfitPartitioner.class);

        // Set the Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Set the Reducer output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.76.131:9000/profit/profit.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.76.131:9000/profit/result4"));

        job.waitForCompletion(true);
    }
}
Run: the output directory again contains three result files.
The first two partition files contain data; the third is empty.
That is, the totals for months 1 and 2 land in partition 0 (part-r-00000), the total for month 3 lands in partition 1 (part-r-00001), and partition 2 (part-r-00002) has no content.
Note: map output is first partitioned, and each reducer then merges (groups) values by key within its own partition.
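Concretely, for the sample input above, each record is first routed through ProfitPartitioner and then grouped by key inside its partition, so the three reducers see and emit:

partition 0 (part-r-00000): (1, [280, 560])  -> 1 840
                            (2, [234, 264])  -> 2 498
partition 1 (part-r-00001): (3, [873, 2323]) -> 3 3196
partition 2 (part-r-00002): (no records)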