Getting Started with MapReduce: 11. A First MapReduce Program, Part 2

1.5 Writing the WordCount Example


1. The JobMain.java class:

package cn.itcast.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class JobMain extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // Get a Job object. It organizes the eight MapReduce steps into one task
        // and submits it to the YARN cluster to run.
        Job job = Job.getInstance(this.getConf(), "xxx");

        // Once we have the Job object, we wire the classes for the eight steps
        // together through it and submit the whole thing to the YARN cluster.

        // 1. Read the file and parse it into k,v pairs -- these are k1,v1.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/wordcount"));

        // 2. Custom map logic: receive the k1,v1 pairs from step 1,
        //    convert them into new k2,v2 pairs, and emit them.
        job.setMapperClass(WordCountMapper.class);
        // Set the k2 type
        job.setMapOutputKeyClass(Text.class);
        // Set the v2 type
        job.setMapOutputValueClass(IntWritable.class);

        /**
         * Steps 3-6 are omitted here: partitioning, sorting, combining, grouping
         * (the framework defaults are used).
         */

        // 7. Set the reducer class: it receives k2,v2 pairs and outputs k3,v3 pairs.
        job.setReducerClass(WordCountReducer.class);
        // Set the k3 output type
        job.setOutputKeyClass(Text.class);
        // Set the v3 output type
        job.setOutputValueClass(IntWritable.class);

        // 8. Set the output format class.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/wordcountout"));

        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Submit the job.
        // When the job finishes, a status code is returned; 0 means it ran successfully.
        int run = ToolRunner.run(conf, new JobMain(), args);
        System.out.println("Exit code: " + run);
        System.exit(run);
    }
}
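As a side note, the input and output paths above are hardcoded to node01. When a driver like this is packaged into a jar and submitted to a real cluster, it is also common to call job.setJarByClass(...) so Hadoop knows which jar to ship to the worker nodes. Below is a minimal variant sketch; the class name JobMainArgs and the use of args[0]/args[1] for the paths are my own additions for illustration, not part of the original code:

package cn.itcast.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical variant of the driver above: paths come from the command line
// instead of being hardcoded, and the job jar is set explicitly.
public class JobMainArgs extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(this.getConf(), "wordcount");
        // Tell Hadoop which jar contains the mapper/reducer classes,
        // so it can be distributed to the cluster nodes.
        job.setJarByClass(JobMainArgs.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path(args[0]));

        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new JobMainArgs(), args));
    }
}

With such a variant you would pass the input and output paths as the first two arguments when launching the jar (roughly: hadoop jar wordcount.jar cn.itcast.wordcount.JobMainArgs /wordcount /wordcountout); the exact command depends on how the project is packaged.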

2. The WordCountMapper class:

package cn.itcast.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * This class extends Mapper, which marks it as a standard mapper class.
 * It takes four generic type parameters: k1, v1, k2, v2
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private Text text = new Text();
    private IntWritable intWritable = new IntWritable();

    /**
     * Override the parent class's map method. It is called once for every line of input,
     * and all of the per-line processing logic lives here.
     *
     * The raw data in the HDFS file:
     *   hello,world,hadoop
     *   hive,sqoop,flume,hello
     *   kitty,tom,jerry,world
     *   hadoop
     *
     * After step 1 (TextInputFormat) it becomes:
     *   0   hello,world,hadoop
     *   18  hive,sqoop,flume,hello
     *   40  kitty,tom,jerry,world
     *   61  hadoop
     *
     * @param key     our k1, the byte offset of the line; usually not useful and can be discarded
     * @param value   our v1, the text of the line; it needs to be split and converted into new k2,v2 pairs
     * @param context the context object, which carries the data on to the next stage
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] words = line.split(",");
        // Iterate over the words we split out
        for (String word : words) {
            text.set(word);
            intWritable.set(1);
            // Emit our k2,v2 pair -- the types here must match the k2,v2 types set on the job
            context.write(text, intWritable);
        }
    }
}
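To make the k2,v2 output concrete, here is a throwaway plain-Java sketch (no Hadoop classes involved) that mimics what map() emits for a single input line; the class name MapperOutputDemo is purely illustrative:

// Plain-Java illustration: the k2,v2 pairs the mapper would emit for one input line.
public class MapperOutputDemo {
    public static void main(String[] args) {
        String line = "hello,world,hadoop";   // one v1 value from the input split
        for (String word : line.split(",")) {
            // The real mapper does context.write(new Text(word), new IntWritable(1))
            System.out.println(word + "\t1"); // k2 = the word, v2 = 1
        }
    }
}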

3. The WordCountReducer class:

package cn.itcast.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * A custom class that extends Reducer, which marks it as a standard reducer class.
 * It takes four generic type parameters: k2, v2, k3, v3
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable intWritable = new IntWritable();

    /**
     * Override the reduce method.
     * @param key     the key received here is our k2
     * @param values  the values received here are a collection whose elements have the v2 type
     * @param context the context object, used to write our data out
     * @throws IOException
     * @throws InterruptedException
     */
    /*
    Original input:
        hello,world,hadoop
        hive,sqoop,flume,hello
        kitty,tom,jerry,world
        hadoop

    What the reducer receives after the shuffle:
        hello  <1,1>
        world  <1,1>
        hadoop <1,1>
    */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        intWritable.set(sum);
        // Write the result out
        context.write(key, intWritable);
    }
}
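To visualize what happens between the mapper and the reducer, the following standalone sketch simulates the shuffle grouping and the reduce-side sum with ordinary Java collections. It only illustrates the concept, not how Hadoop is actually implemented; ShuffleReduceDemo is a made-up name:

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

// Plain-Java illustration of what shuffle + reduce do conceptually:
// group the mapper's (word, 1) pairs by word, then sum each group.
public class ShuffleReduceDemo {
    public static void main(String[] args) {
        String[] lines = {
            "hello,world,hadoop",
            "hive,sqoop,flume,hello",
            "kitty,tom,jerry,world",
            "hadoop"
        };

        // "Shuffle": collect every 1 emitted for a word into a list keyed by that word
        Map<String, List<Integer>> grouped = new TreeMap<>();
        for (String line : lines) {
            for (String word : line.split(",")) {
                grouped.computeIfAbsent(word, k -> new ArrayList<>()).add(1);
            }
        }

        // "Reduce": sum each word's list, just like WordCountReducer does
        for (Map.Entry<String, List<Integer>> entry : grouped.entrySet()) {
            int sum = 0;
            for (int one : entry.getValue()) {
                sum += one;
            }
            System.out.println(entry.getKey() + "\t" + sum);
        }
    }
}

For the sample input above, this prints each word with its total, e.g. hadoop 2, hello 2, world 2, and 1 for the words that appear only once.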


Reposted from www.cnblogs.com/mediocreWorld/p/10970920.html