Section 2, MapReduce in Depth: 16. Implementing the Map-Side Join Algorithm

The map-side join algorithm is suited to joining a large table with a small table: the small table's data is loaded entirely into memory in each map task (via the distributed cache), so the join is completed on the map side without a reduce phase.
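
As a rough illustration, the two inputs look something like this (the field layouts are inferred from the sample lines in the mapper comments further down, not from a published schema):

pdts.txt, the small product table shipped through the distributed cache: p0001,小米5,1000,2000
The order data, the large table read as the normal job input: 1001,20150710,p0001,2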

 

Code:

 

MapJoinMain:
package cn.itcast.demo5.mapJoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

public class MapJoinMain extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {

        Configuration conf = this.getConf();

        // Register the small table (product data) in the distributed cache
        // so that every map task can load it into memory in setup().
        DistributedCache.addCacheFile(new URI("hdfs://node01:8020/product_cache/pdts.txt"), conf);

        Job job = Job.getInstance(conf, MapJoinMain.class.getSimpleName());
        // job.setJarByClass(MapJoinMain.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///D:\\Study\\BigData\\heima\\stage2\\4、大数据离线第四天\\map端join\\map_join_input"));

        job.setMapperClass(MapJoinMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("file:///D:\\Study\\BigData\\heima\\stage2\\4、大数据离线第四天\\map端join\\map_join_output"));

        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new Configuration(), new MapJoinMain(), args);
        System.exit(run);
    }

}
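
A side note: the DistributedCache class used above is deprecated in newer Hadoop releases. Below is a minimal sketch (an addition for illustration, not part of the original course code) of how the same cache wiring looks with the non-deprecated Job/Context API, assuming Hadoop 2.x or later and keeping the same pdts.txt path; everything else in MapJoinMain and MapJoinMapper stays unchanged.

package cn.itcast.demo5.mapJoin;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

// Sketch only: shows the cache-related calls that replace the deprecated DistributedCache usage.
public class MapJoinNewApiSketch {

    public static Job buildJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "mapJoin");
        // Replaces DistributedCache.addCacheFile(uri, conf): register the small table on the Job itself.
        job.addCacheFile(new URI("hdfs://node01:8020/product_cache/pdts.txt"));
        // ... the rest of the job wiring (mapper, input/output formats and paths) is the same as in MapJoinMain ...
        return job;
    }

    public static class CacheAwareMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Replaces DistributedCache.getCacheFiles(conf): read the cache list from the task context.
            URI[] cacheFiles = context.getCacheFiles();
            FileSystem fileSystem = FileSystem.get(cacheFiles[0], context.getConfiguration());
            // ... open cacheFiles[0] with fileSystem and fill the in-memory map exactly as in MapJoinMapper.setup() ...
        }
    }
}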

MapJoinMapper:
package cn.itcast.demo5.mapJoin;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class MapJoinMapper extends Mapper<LongWritable, Text, Text, Text> {

    // In-memory copy of the small table, keyed by product id
    Map<String, String> map = new HashMap<String, String>();

    /**
     * Initialization method, called once before any map() call;
     * this is where we can read the distributed cache file.
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        // Get the list of cache files
        URI[] cacheFiles = DistributedCache.getCacheFiles(conf);
        // Get the distributed file system
        FileSystem fileSystem = FileSystem.get(cacheFiles[0], conf);
        // Open the cached file as an input stream
        FSDataInputStream inputStream = fileSystem.open(new Path(cacheFiles[0]));
        // Read the input stream line by line through a BufferedReader
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));

        String line = null;
        while ((line = reader.readLine()) != null) {
            // Product record, e.g. p0001,小米5,1000,2000
            String[] split = line.split(",");
            map.put(split[0], line);
        }

        IOUtils.closeQuietly(reader);
        IOUtils.closeQuietly(inputStream);
        fileSystem.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // value.toString() is an order record, e.g. 1001,20150710,p0001,2
        String line = value.toString();
        if (line != null && !"".equals(line)) {
            String[] split = line.split(",");
            String pid = split[2];
            // map.get(pid) is the matching product record, e.g. p0001,xiaomi,1000,2
            String v2 = map.get(pid) + "\t" + line;
            context.write(new Text(pid), new Text(v2));
        }
    }
}
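
For reference, with the sample records from the comments above (order 1001,20150710,p0001,2 and cached product p0001,小米5,1000,2000), the mapper emits the product id as the key and "product line + tab + order line" as the value, so a joined output line written by TextOutputFormat would look roughly like:

p0001	p0001,小米5,1000,2000	1001,20150710,p0001,2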


Reprinted from: www.cnblogs.com/mediocreWorld/p/11029115.html