Counting the word frequencies of a text file
package Test01;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {
    public static void main(String[] args) {
        try {
            // Create the configuration object (cluster settings could be set here)
            Configuration conf = new Configuration();
            // Create the job
            Job job = Job.getInstance(conf, "WordCount02");
            // Name the main class so Hadoop knows which jar to ship
            job.setJarByClass(WordCount.class);
            // Map-side settings
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            // Reduce-side settings; the final output value type (IntWritable)
            // differs from the map output value type (LongWritable), which is
            // also why MyReducer cannot double as a combiner here
            job.setReducerClass(MyReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            // Input and output paths come from the command line
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // Submit the job, wait for completion, and exit with its status
            int success = job.waitForCompletion(true) ? 0 : 1;
            System.exit(success);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Custom Mapper: emits (word, 1) for each word of an input line
    public static class MyMapper extends Mapper<Object, Text, Text, LongWritable> {
        public Text k = new Text();
        public LongWritable v = new LongWritable(1L);

        // map(), the core logic of the map phase, is called once per input line
        @Override
        protected void map(Object key, Text value, Context context) {
            try {
                // Get the line as a string
                String row = value.toString();
                // Split the line into words on single spaces
                String[] words = row.split(" ");
                for (String st : words) {
                    // Guard against empty tokens from consecutive spaces,
                    // which would otherwise crash the indexing below
                    if (st.isEmpty()) {
                        continue;
                    }
                    String st2;
                    char[] ct = st.toCharArray();
                    char last = ct[st.length() - 1];
                    // Keep the word as-is if it ends with a letter;
                    // otherwise drop the single trailing punctuation character
                    if ((last >= 'a' && last <= 'z') || (last >= 'A' && last <= 'Z')) {
                        st2 = String.valueOf(ct);
                    } else {
                        st2 = st.substring(0, st.length() - 1);
                    }
                    // Skip tokens that became empty after cleanup
                    if (!st2.isEmpty()) {
                        k.set(st2);
                        context.write(k, v);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    // Custom Reducer: sums the 1s emitted for each word
    public static class MyReducer extends Reducer<Text, LongWritable, Text, IntWritable> {
        public IntWritable v = new IntWritable();

        // reduce(), the core logic of the reduce phase, is called once per key
        @Override
        protected void reduce(Text key, Iterable<LongWritable> value, Context context) {
            try {
                // Sum the counts for this word (narrowed from long to int)
                int cnt = 0;
                for (LongWritable i : value) {
                    cnt += i.get();
                }
                v.set(cnt);
                // Emit (word, total count)
                context.write(key, v);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
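One caveat in MyMapper: the cleanup only strips a single trailing punctuation character, so a token like "World!!" would still keep one "!". Below is a minimal alternative sketch of the map() body (not part of the original program) that strips all trailing non-letters with a regex; it assumes the same k and v fields as MyMapper and additionally needs import java.io.IOException;:

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split on runs of whitespace so consecutive spaces produce no empty tokens
        for (String st : value.toString().split("\\s+")) {
            // Drop every trailing character that is not a letter
            String st2 = st.replaceAll("[^A-Za-z]+$", "");
            if (!st2.isEmpty()) {
                k.set(st2);
                context.write(k, v);
            }
        }
    }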
(1) Open the Maven panel on the right side of IDEA, then under Lifecycle right-click clean and choose Run Maven Build.
(2) Next, right-click install and choose Run Maven Build.
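If the build cannot resolve the Hadoop classes, the project is missing the hadoop-client dependency in pom.xml. A minimal sketch; the version number here is an assumption and should match the Hadoop version on the cluster:

    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <!-- assumed version; match your cluster -->
        <version>3.1.3</version>
    </dependency>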
(3) Copy the jar from the target directory to the Hadoop server.
(4) Run the job with the following command:
yarn jar MapReducePractice-1.0-SNAPSHOT.jar Test01.WordCount /test/input/t1.txt /test/output/01
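This assumes the input file is already on HDFS and that the output directory /test/output/01 does not exist yet (Hadoop refuses to write into an existing output path). If the input still needs uploading, something like the following works, assuming t1.txt sits in the current local directory:

yarn dfs -mkdir -p /test/input
hdfs dfs -put t1.txt /test/input/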
(5) View the results (MapReduce sorts reduce output by key, which is why the words appear in lexicographic order):
[hadoop@hadoop105 ~]$ hdfs dfs -cat /test/output/01/part-r-00000
Dream 1
Hello 2
I 1
Love 1
World 1
You 2
are 1
hope 1
my 1