MapReduce
MapReduce is a programming model for the parallel processing of large data sets (larger than 1 TB). Its two core concepts, "Map" and "Reduce", are borrowed from functional programming languages (move the code to the data rather than the data to the code), together with features taken from vector programming languages (divide the work into stages and parallelize each stage).
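As a quick illustration of the borrowed idea, here is a minimal sketch in plain Java 8 streams (not Hadoop; the class name MapReduceIdea is made up for illustration): each word is mapped to a (word, 1) pair, and the 1s belonging to the same key are reduced into a sum.
import java.util.Arrays;
import java.util.Map;
import java.util.stream.Collectors;

public class MapReduceIdea {
    public static void main(String[] args) {
        // map: each word becomes a (word, 1) pair; reduce: Integer::sum folds the 1s per key
        Map<String, Integer> counts = Arrays.stream("a b a c".split(" "))
                .collect(Collectors.toMap(w -> w, w -> 1, Integer::sum));
        System.out.println(counts); // {a=2, b=1, c=1}
    }
}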
MapReduce is Hadoop's parallel computing framework. It splits a computation into two phases, a Map phase and a Reduce phase. The framework makes full use of the compute resources (memory/CPU/network/a little disk) of the physical hosts that run the storage nodes (DataNodes) to compute in parallel. MapReduce starts a NodeManager process on every storage node to manage and use that node's compute resources. By default a NodeManager abstracts the compute resources of its physical host into 8 compute units, each called a Container, and every NodeManager is scheduled by the ResourceManager, which is responsible for the overall allocation of compute resources.
==Resource Manager==: coordinates the cluster's compute resources, manages all NodeManagers, and allocates resources
==Node Manager==: manages the Containers (compute resources) of its physical host and reports its own status to the ResourceManager
==MRAppMaster==: the master of a computation job; requests compute resources and coordinates the job's tasks.
==YarnChild==: the process that does the actual computing (MapTask/ReduceTask)
==Container==: an abstraction of compute resources, representing a slice of memory/CPU/network. Both the MRAppMaster and each YarnChild consume one Container while running.
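The "8 compute units" above reflect the NodeManager's default resource settings in Hadoop 2.x (8192 MB of memory and 8 virtual cores). If those defaults do not match your hardware, they can be tuned in yarn-site.xml, for example:
<!--Total memory the NodeManager may hand out to Containers (default 8192 MB)-->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>8192</value>
</property>
<!--Total virtual cores the NodeManager may hand out (default 8)-->
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>8</value>
</property>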
YARN Environment Setup
Configuration files
etc/hadoop/yarn-site.xml
[root@CentOS ~]# cd /usr/hadoop-2.6.0/
[root@CentOS hadoop-2.6.0]# ls
bin hadoop-root lib LICENSE.txt NOTICE.txt sbin
etc include libexec logs README.txt share
[root@CentOS hadoop-2.6.0]# vi etc/hadoop/yarn-site.xml
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!--Resource Manager-->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>CentOS</value>
</property>
etc/hadoop/mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
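Note: a fresh Hadoop 2.6 unpack ships only etc/hadoop/mapred-site.xml.template; if mapred-site.xml does not exist yet, copy the template before editing it:
[root@CentOS hadoop-2.6.0]# cp etc/hadoop/mapred-site.xml.template etc/hadoop/mapred-site.xml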
Start the compute services
[root@CentOS ~]# start-yarn.sh
[root@CentOS ~]# jps
11459 NameNode
11575 DataNode
11722 SecondaryNameNode
18492 ResourceManager
18573 NodeManager
Visit: http://centos:8088/
If the ResourceManager web UI loads, the setup succeeded.
MapReduce HelloWorld
ip userid productid oper_type stay_time date
-------------------------------------------------
192.168.0.12 1 001 click 5000 2019-01-04 14:44:00
192.168.0.12 1 001 add_card 5000 2019-01-04 14:44:00
192.168.0.13 2 003 click 5000 2019-01-04 14:44:00
192.168.0.11 3 001 click 5000 2019-01-04 14:44:00
select ip,sum(1) from t_access group by ip;
Grouping and aggregation: the Map function turns each input row into the pair (ip, 1); the framework groups these pairs by key, and Reduce(ip, Int[]{1,1,1,...}) sums the 1s for each ip. For the four sample rows above, the expected result is 192.168.0.12 -> 2, 192.168.0.13 -> 1, 192.168.0.11 -> 1.
Maven dependencies
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>2.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>2.6.0</version>
</dependency>
Mapper logic
public class IpMapper extends Mapper<LongWritable,Text,Text, IntWritable> {
/**
*192.168.0.12 1 001 click 5000 2019-01-04 14:44:00
* @param key : byte offset of the input line within the file
* @param value : one line of input text
* @param context
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] tokens = value.toString().split(" "); // fields are space-separated
String ip=tokens[0]; // ip is the first field
context.write(new Text(ip),new IntWritable(1)); // emit (ip, 1)
}
}
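An optional refinement, shown as a sketch: Hadoop calls map() once per input line, so the output key/value objects can be allocated once and reused instead of being created for every record (a common MapReduce idiom; the job's output is unchanged):
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class IpMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Text outKey = new Text();                     // reused across map() calls
    private static final IntWritable ONE = new IntWritable(1);  // never mutated, safe to share
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        outKey.set(value.toString().split(" ")[0]);             // ip is the first field
        context.write(outKey, ONE);
    }
}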
Reducer logic
public class IpReducer extends Reducer<Text, IntWritable,Text,IntWritable> {
/**
*
* @param key :ip
* @param values: Int[]{1,1,1,..}
* @param context
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int total=0;
for (IntWritable value : values) { // sum the 1s grouped under this ip
total+=value.get();
}
context.write(key,new IntWritable(total));
}
}
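Before running on the cluster, the map -> group -> reduce flow can be sanity-checked in plain Java against the four sample rows (a standalone sketch, not part of the Hadoop job; the class name LocalSimulation is made up):
import java.util.LinkedHashMap;
import java.util.Map;

public class LocalSimulation {
    public static void main(String[] args) {
        String[] lines = {
                "192.168.0.12 1 001 click 5000 2019-01-04 14:44:00",
                "192.168.0.12 1 001 add_card 5000 2019-01-04 14:44:00",
                "192.168.0.13 2 003 click 5000 2019-01-04 14:44:00",
                "192.168.0.11 3 001 click 5000 2019-01-04 14:44:00"
        };
        Map<String, Integer> counts = new LinkedHashMap<>();
        for (String line : lines) {
            String ip = line.split(" ")[0];    // what IpMapper emits as the key
            counts.merge(ip, 1, Integer::sum); // what IpReducer does with the grouped 1s
        }
        System.out.println(counts); // {192.168.0.12=2, 192.168.0.13=1, 192.168.0.11=1}
    }
}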
Job setup
public class CustomJobSubmiter extends Configured implements Tool {
public int run(String[] args) throws Exception {
//1. Create the Job object
Configuration conf=getConf();
Job job=Job.getInstance(conf);
//2. Set the input and output formats
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
//3. Set the input and output paths
Path src=new Path("/demo/access");
TextInputFormat.addInputPath(job,src);
Path dst=new Path("/demo/result");
TextOutputFormat.setOutputPath(job,dst);
//4. Set the Mapper and Reducer
job.setMapperClass(IpMapper.class);
job.setReducerClass(IpReducer.class);
//5. Set the Mapper and Reducer output key/value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//6. Submit the job
job.submit();
return 0;
}
public static void main(String[] args) throws Exception {
ToolRunner.run(new CustomJobSubmiter(),args);
}
}
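One note on step 6: job.submit() hands the job to the cluster and returns immediately, reporting nothing back. When submitting from a terminal it is usually more convenient to block until the job finishes and print its progress, which the standard Job API supports:
//6. Submit the job, wait for it to finish, and print progress to the console
return job.waitForCompletion(true) ? 0 : 1;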
Submitting a job
1. Remote jar deployment (the most common approach)
Upload Hadoop_MapReduce-1.0-SNAPSHOT.jar
[root@CentOS ~]# ls
hadoop-2.6.0_x64.tar.gz Hadoop_MapReduce-1.0-SNAPSHOT.jar # the uploaded jar
[root@CentOS ~]# vi t_access # paste the sample data into this file
[root@CentOS ~]# hdfs dfs -put /root/t_access /demo/access # push the data to HDFS
[root@CentOS ~]# jps # check the running processes
5043 ResourceManager
4900 SecondaryNameNode
6101 Jps
5131 NodeManager
4637 NameNode
4751 DataNode
[root@CentOS ~]# hadoop jar Hadoop_MapReduce-1.0-SNAPSHOT.jar com.baizhi.demo2.CustomJobSubmiter # run the job on the cluster
Before packaging the jar, one extra line must be added to the submitter class, otherwise the framework cannot locate the Mapper and Reducer classes at runtime:
public class CustomJobSubmiter extends Configured implements Tool {
public int run(String[] args) throws Exception {
//1. Create the Job object
Configuration conf=getConf();
Job job=Job.getInstance(conf);
//Set the jar class loader; otherwise the MapReduce framework cannot find the Mapper and Reducer
job.setJarByClass(CustomJobSubmiter.class);//this is the added line
//2. Set the input and output formats
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
//3. Set the input and output paths
Path src=new Path("/demo/access");
TextInputFormat.addInputPath(job,src);
Path dst=new Path("/demo/result");
TextOutputFormat.setOutputPath(job,dst);
//4. Set the Mapper and Reducer
job.setMapperClass(IpMapper.class);
job.setReducerClass(IpReducer.class);
//5. Set the Mapper and Reducer output key/value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//6. Submit the job
job.submit();
return 0;
}
public static void main(String[] args) throws Exception {
ToolRunner.run(new CustomJobSubmiter(),args);
}
}
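If the job succeeds, TextOutputFormat writes its results under /demo/result, one file per reducer (part-r-00000 for the first reducer), with key and count separated by a tab. For the sample data above, the counts can be verified with:
[root@CentOS ~]# hdfs dfs -cat /demo/result/part-r-00000
192.168.0.11	1
192.168.0.12	2
192.168.0.13	1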